Create figures for the manuscript, extended abstract and poster.
packrat::init("/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeR")
## Initializing packrat project in directory:
## - "~/Projects/Scipher/SampleSize/scripts/SampleSizeR"
## Initialization complete!
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggalluvial)
## Loading required package: ggplot2
library(gghalves)
library(ggplot2)
library(ggnewscale)
require(ggrepel)
## Loading required package: ggrepel
library(grid)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(gtable)
require(magrittr)
## Loading required package: magrittr
library(patchwork)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
## The following object is masked from 'package:igraph':
##
## crossing
set.seed(1510)
options(bitmapType='cairo')
`%ni%` <- Negate(`%in%`)
Un-normalized data:
input_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data'
plots_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/plots'
tables_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/tables'
numbers_file = paste(input_dir, 'dataset_numbers_complete_graph.txt', sep='/')
numbers_df = fread(numbers_file)
numbers_df$dataset = tolower(numbers_df$dataset)
meta_file = paste(input_dir, "metadata_summary.txt", sep="/")
meta_df = fread(meta_file)
sd_genes_file = paste(input_dir, "variation_by_gene.txt", sep="/") # from => calculate_rnaseq_datasets_variation.Rmd
sd_genes_df = fread(sd_genes_file)
name2color <- c(
"TCGA: Lung cancer" = "#56B4E9",
"tcga:tcga-luad" = "#56B4E9",
"tcga:tcga-luad-tumor" = "#56B4E9", # #D55E00
"TCGA: Breast cancer" = "#0072B2",
"TCGA: Breast c." = "#0072B2",
"tcga:tcga-brca_female" = "#0072B2",
"tcga:tcga-brca-tumor_female" = "#0072B2",
"breast neoplasms"="#0072B2",
"GTEx: Whole blood" = "#65F3BF",
"gtex:whole.blood" = "#65F3BF",
"GTEx: Lung" = "#24D157",
"gtex:lung" = "#24D157",
"GTEx: Breast" = "#00BA37", #44AA99
"gtex:breast.mammary.tissue" = "#00BA37",
"gtex:breast.mammary.tissue_female" = "#00BA37",
"R. arthritis" = "#D55E00", #E69F00
"scipher:scipher.sample.per.patient.baseline" = "#D55E00",
"arthritis rheumatoid"="#D55E00",
"asthma"="#d9f365",
"thyroid neoplasms"="#0072B2",
"tcga" = "#0072B2",
"TCGA" = "#0072B2",
"gtex" = "#00BA37",
"GTEx" = "#00BA37",
"scipher" = "#D55E00",
#"Analytical model"="#e3f705",
"Analytical model"="#cc68f7",
"Power law"="#cc68f7",
"Logarithm"="#09b863",
"Exponential decay"="#09b863",
"\u2265 0.2" = "#F8766D",
"\u2265 0.4" = "#7CAE00",
"\u2265 0.6" = "#00BFC4",
"\u2265 0.8" = "#C77CFF",
"Very Weak" = "#ff7b00",
"Weak" = "#F8766D",
"Moderate" = "#7CAE00",
"Strong" = "#00BFC4",
"Very Strong" = "#C77CFF",
"Convergence point" = "red",
"Model prediction" = "red",
"P-value < 0.05" = "red",
"common" = "#619CFF",
"disease-gene" = "#cc68f7",
"disease-specific" = "#F8766D",
"normal-specific" = "#00BA38",
"undefined" = "#808080",
"different" = "#C77CFF",
"all" = "#CD9600",
"common (constant)" = "#619CFF",
"common (change)" = "#b5d1ff",
"disease-specific (constant)" = "#F8766D",
"disease-specific (change)" = "#ffcbc7",
"normal-specific (constant)" = "#00BA38",
"normal-specific (change)" = "#a3d1b1",
"undefined (constant)" = "#808080",
"undefined (change)" = "#c2c2c2"
)
input_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data/example_pearson_pval_0.05'
topology_results_file = paste(input_dir, 'topology_results_pearson_pval_0.05.txt', sep='/')
results_selected_df = fread(topology_results_file)
topology_results_by_size_file = paste(input_dir, 'topology_results_mean_pearson_pval_0.05.txt', sep='/')
topology_results_selected_by_size_df = fread(topology_results_by_size_file)
predictions_file = paste(input_dir, 'predictions_pearson_pval_0.05.txt', sep='/')
predicted_results_df = fread(predictions_file)
analytical_results_file = paste(input_dir, 'analytical_model_results_pearson_pval_0.05.txt', sep='/')
topology_results_selected_analytical_df = fread(analytical_results_file)
analytical_summary_file = paste(input_dir, 'analytical_model_summary_pearson_pval_0.05.txt', sep='/')
analytical_model_summary_df = fread(analytical_summary_file)
analytical_regression_results_file = paste(input_dir, 'analytical_model_regression_results_pearson_pval_0.05.txt', sep='/')
stretched_exponential_regression_df = fread(analytical_regression_results_file)
theoretical_sample_size_file = paste(input_dir, 'theoretical_sample_size_for_correlations_of_datasets.txt', sep='/') # from => calculate_convergence_correlation_types.Rmd
sample_size_correlation_df = fread(theoretical_sample_size_file)
Normalized data:
topology_type_normalization = "divide.L"
input_dir = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data/example_pearson_pval_0.05'
topology_results_file = paste(input_dir, '/', 'topology_results_norm_', topology_type_normalization, '_pearson_pval_0.05.txt', sep='')
results_selected_norm_df = fread(topology_results_file)
topology_results_by_size_file = paste(input_dir, '/', 'topology_results_mean_norm_', topology_type_normalization, '_pearson_pval_0.05.txt', sep='')
topology_results_selected_by_size_norm_df = fread(topology_results_by_size_file)
predictions_file = paste(input_dir, '/', 'predictions_', topology_type_normalization, '_pearson_pval_0.05.txt', sep='')
predicted_results_norm_df = fread(predictions_file)
analytical_results_file = paste(input_dir, '/', 'analytical_model_results_', topology_type_normalization, '_pearson_pval_0.05.txt', sep='')
topology_results_selected_analytical_norm_df = fread(analytical_results_file)
Number of networks and maximum sample size:
nrow((results_selected_df %>%
dplyr::select(method, type_dataset, size, rep) %>%
unique()))
## [1] 2239
max(((results_selected_df %>%
dplyr::select(method, type_dataset, size, rep) %>%
unique()))$size)
## [1] 1080
Number of networks and maximum sample size without counting TCGA full dataset:
nrow((results_selected_df %>%
dplyr::select(method, type_dataset, size, rep) %>%
filter(!(type_dataset == "tcga:tcga")) %>%
unique()))
## [1] 2239
max(((results_selected_df %>%
dplyr::select(method, type_dataset, size, rep) %>%
filter(!(type_dataset == "tcga:tcga")) %>%
unique()))$size)
## [1] 1080
All datasets:
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "gtex:whole.blood", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
datasets_selected = c("gtex:breast.mammary.tissue_female",
"gtex:lung",
"gtex:whole.blood",
"tcga:tcga-brca_female",
"tcga:tcga-luad",
"scipher:scipher.sample.per.patient.baseline")
results_curve_df = results_selected_norm_df %>%
dplyr::filter(type_dataset %in% datasets_selected) %>%
dplyr::select(size, rep, unnorm, type_dataset) %>%
mutate(type_dataset = replace(type_dataset,
type_dataset == "scipher:scipher.sample.per.patient.baseline",
"R. arthritis")) %>%
mutate(type_dataset = replace(type_dataset,
type_dataset == "gtex:whole.blood",
"GTEx: Whole blood")) %>%
mutate(type_dataset = replace(type_dataset,
type_dataset == "gtex:lung",
"GTEx: Lung")) %>%
mutate(type_dataset = replace(type_dataset,
type_dataset == "gtex:breast.mammary.tissue_female",
"GTEx: Breast")) %>%
mutate(type_dataset = replace(type_dataset,
type_dataset == "tcga:tcga-luad",
"TCGA: Lung cancer")) %>%
mutate(type_dataset = replace(type_dataset,
type_dataset == "tcga:tcga-brca_female",
"TCGA: Breast cancer")) %>%
inner_join( (name2color %>%
as.data.frame() %>%
dplyr::rename("rgb_col"=".") %>%
tibble::rownames_to_column("type_dataset")),
by=c("type_dataset") ) %>%
mutate(geom_text_repel_label =
dplyr::if_else(type_dataset == "R. arthritis" & size == 240 & rep == 1,
"R. arthritis",
dplyr::if_else(type_dataset == "GTEx: Whole blood" & size == 700 & rep == 1,
"GTEx: Whole blood",
dplyr::if_else(type_dataset == "GTEx: Lung" & size == 560 & rep == 1, "GTEx: Lung",
dplyr::if_else(type_dataset == "GTEx: Breast" & size == 140 & rep == 1,
"GTEx: Breast",
dplyr::if_else(type_dataset == "TCGA: Lung cancer" & size == 460 & rep == 1,
"TCGA: Lung cancer",
dplyr::if_else(type_dataset == "TCGA: Breast cancer" & size == 600 & rep == 1,
"TCGA: Breast cancer", "")))))))
# # Plot with legend
# curve_plot = results_curve_df %>%
# ggplot(aes(x=size, y=unnorm, col=type_dataset)) +
# geom_point(alpha=0.5, size=3) +
# scale_color_manual(values = c(name2color["R. arthritis"][[1]], name2color["GTEx: Whole blood"][[1]], name2color["GTEx: Lung"][[1]], name2color["GTEx: Breast"][[1]], name2color["TCGA: Lung cancer"][[1]], name2color["TCGA: Breast cancer"][[1]])) +
# theme_linedraw() +
# xlab("Num. samples") +
# #xlab("log(N)") +
# ylab("Num. significant correlations") +
# guides(col=guide_legend(title="Dataset")) +
# theme(aspect.ratio=1,
# plot.title = element_text(size = 20, face="bold"),
# axis.title = element_text(size = 17, face="bold"),
# axis.text = element_text(size = 16),
# legend.text = element_text(size = 16),
# legend.title=element_text(size=16, face="bold"))
# Plot with labels
curve_plot = results_curve_df %>%
ggplot(aes(x=size, y=unnorm, label = geom_text_repel_label)) +
geom_point(aes(col=rgb_col), alpha=0.5, size=3) +
scale_colour_identity() +
theme_linedraw() +
xlab("Num. samples") +
ylab("Num. significant correlations") +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none"
) +
#geom_text_repel(aes(col=rgb_col), box.padding = 0.5, max.overlaps = Inf) +
geom_label_repel(aes(col=rgb_col),
fill="white",
box.padding = 0.5, max.overlaps = Inf,
label.size = 0.75,
size = 5,
alpha = 0.9,
label.padding=0.25,
fontface = "bold",
na.rm=TRUE,
seed = 1234)
plot_file = paste(plots_dir, "curve_pearson_pval_0.05.png", sep="/")
ggsave(
plot_file,
plot=curve_plot,
dpi = 1200,
#width = 10000,
height = 6000,
units = c("px")
)
## Saving 8400 x 6000 px image
print(curve_plot)
Less datasets:
datasets_selected = c("gtex:breast.mammary.tissue_female",
"tcga:tcga-brca_female")
results_curve_lessdatasets_df = results_selected_norm_df %>%
dplyr::filter(type_dataset %in% datasets_selected) %>%
dplyr::select(size, rep, unnorm, type_dataset) %>%
mutate(type_dataset =
replace(type_dataset,
type_dataset == "gtex:breast.mammary.tissue_female",
"GTEx: Breast")) %>%
mutate(type_dataset =
replace(type_dataset,
type_dataset == "tcga:tcga-brca_female",
"TCGA: Breast cancer")) %>%
inner_join( (name2color %>%
as.data.frame() %>%
dplyr::rename("rgb_col"=".") %>%
tibble::rownames_to_column("type_dataset")),
by = c("type_dataset") ) %>%
mutate(geom_text_repel_label =
dplyr::if_else(type_dataset == "R. arthritis" & size == 240 & rep == 1,
"R. arthritis",
dplyr::if_else(type_dataset == "GTEx: Whole blood" & size == 700 & rep == 1,
"GTEx: Whole blood",
dplyr::if_else(type_dataset == "GTEx: Lung" & size == 560 & rep == 1, "GTEx: Lung",
dplyr::if_else(type_dataset == "GTEx: Breast" & size == 140 & rep == 1,
"GTEx: Breast",
dplyr::if_else(type_dataset == "TCGA: Lung cancer" & size == 460 & rep == 1,
"TCGA: Lung cancer",
dplyr::if_else(type_dataset == "TCGA: Breast cancer" & size == 600 & rep == 1,
"TCGA: Breast cancer", "")))))))
# Plot with labels
curve_plot_lessdatasets = results_curve_lessdatasets_df %>%
ggplot(aes(x=size, y=unnorm, label = geom_text_repel_label)) +
geom_point(aes(col=rgb_col), alpha=0.5, size=3) +
scale_colour_identity() +
theme_linedraw() +
xlab("Num. samples") +
ylab("Num. significant correlations") +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none"
) +
#geom_text_repel(aes(col=rgb_col), box.padding = 0.5, max.overlaps = Inf) +
geom_label_repel(aes(col=rgb_col),
fill="white",
box.padding = 0.5, max.overlaps = Inf,
label.size = 0.75,
size = 5,
alpha = 0.9,
label.padding=0.25,
fontface = "bold",
na.rm=TRUE,
seed = 1234)
plot_file = paste(plots_dir, "curve_pearson_pval_0.05_lessdatasets.png", sep="/")
ggsave(
plot_file,
plot=curve_plot_lessdatasets,
dpi = 1200,
#width = 10000,
height = 6000,
units = c("px")
)
## Saving 8400 x 6000 px image
print(curve_plot_lessdatasets)
Calculate scaling relation plot:
datasets_selected = c("gtex:breast.mammary.tissue_female",
"gtex:lung",
"gtex:whole.blood",
"tcga:tcga-brca_female",
"tcga:tcga-luad",
"scipher:scipher.sample.per.patient.baseline")
# Calculate data from scaling relationship
cols = c("size",
"S",
"type_dataset",
"S_lag1",
"Fn",
"slope",
"intercept",
"adj.r.squared")
scaling_relation_df = data.frame(matrix(ncol = length(cols),
nrow = 0,
dimnames = list(NULL, cols)))
for (dataset_selected in datasets_selected){
scaling_relation_filt = topology_results_selected_by_size_df %>%
filter(type_dataset == dataset_selected) %>%
select(size, mean, type_dataset) %>%
unique() %>%
rename("S"="mean") %>%
arrange(size) %>%
mutate(S_lag1 = if_else(size == min(size), 0, lag(S))) %>%
mutate(Fn = ((S - S_lag1)/S_lag1)) %>%
filter((!(is.infinite(Fn))) & (Fn > 0))
lm_summary = summary(lm(log(scaling_relation_filt$Fn) ~ log(scaling_relation_filt$size)))
slope = coef(lm_summary)[2]
intercept = coef(lm_summary)[1]
adj.r.squared = lm_summary$adj.r.squared
scaling_relation_df = rbind(scaling_relation_df,
cbind(scaling_relation_filt,
data.frame(slope = slope,
intercept = intercept,
adj.r.squared = adj.r.squared)))
}
# Process results
scaling_relation_df = scaling_relation_df %>%
filter(Fn > 0) %>%
mutate_at(c('adj.r.squared'), ~sprintf("%.2f",.)) %>%
mutate_at(c('adj.r.squared'), as.character) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.sample.per.patient.baseline", "R. arthritis")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:lung", "GTEx: Lung")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:breast.mammary.tissue_female", "GTEx: Breast")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-luad", "TCGA: Lung cancer")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca_female", "TCGA: Breast cancer")) %>%
inner_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("type_dataset")), by=c("type_dataset") ) %>%
mutate(geom_text_repel_label=dplyr::if_else(type_dataset == "R. arthritis" & size == 240, "R. arthritis", dplyr::if_else(type_dataset == "GTEx: Whole blood" & size == 160, "GTEx: Whole blood", dplyr::if_else(type_dataset == "GTEx: Lung" & size == 160, "GTEx: Lung", dplyr::if_else(type_dataset == "GTEx: Breast" & size == 140, "GTEx: Breast", dplyr::if_else(type_dataset == "TCGA: Lung cancer" & size == 120, "TCGA: Lung cancer", dplyr::if_else(type_dataset == "TCGA: Breast cancer" & size == 600, "TCGA: Breast cancer", "")))))))
# Plot with legend
# scaling_relation_plot = scaling_relation_df %>%
# ggplot(aes(x=log(size), y=log(Fn), col=adj.r.squared)) +
# geom_point(alpha=0.5, size=3) +
# geom_abline(aes(intercept=intercept, slope=slope, col=adj.r.squared), size=1) +
# theme_linedraw() +
# xlab("log(n)") +
# ylab(bquote(bold(log((S[n]-S[n-1])/S[n-1])))) +
# #scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
# #scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#65F3BF", "#0072B2", "#56B4E9")) +
# scale_color_manual(values = c("#56B4E9", "#E69F00", "#44AA99", "#65F3BF", "#0072B2", "#D55E00")) +
# guides(col=guide_legend(title=bquote(bold(R^2)))) +
# theme(aspect.ratio=1,
# plot.title = element_text(size = 20, face="bold"),
# axis.title = element_text(size = 17, face="bold"),
# axis.text = element_text(size = 16),
# legend.position = c(0.895, 0.77),
# legend.text = element_text(size = 11),
# legend.title=element_text(size=13, face="bold"),
# legend.title.align=0.5,
# legend.background = element_rect(size=0.2, linetype="solid", colour ="black"))
# Plot with labels
scaling_relation_plot = scaling_relation_df %>%
ggplot(aes(x=log(size), y=log(Fn), col=rgb_col)) +
geom_point(alpha=0.5, size=3) +
geom_abline(aes(intercept=intercept, slope=slope, col=rgb_col), size=1) +
theme_linedraw() +
xlab("log(n)") +
ylab(bquote(bold(log((S[n]-S[n-1])/S[n-1])))) +
scale_colour_identity() +
guides(col=guide_legend(title=bquote(bold(R^2)))) +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none") +
geom_label_repel(aes(col=rgb_col, label=geom_text_repel_label),
fill="white",
box.padding = 0.5, max.overlaps = Inf,
label.size = 0.75,
size = 5,
alpha = 0.6,
label.padding=0.25,
fontface = "bold",
na.rm=TRUE,
seed = 1234)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
plot_file = paste(plots_dir, "scaling_relation_pearson_pval_0.05.png", sep="/")
ggsave(
plot_file,
plot=scaling_relation_plot,
dpi = 1200,
#width = 9500,
height = 6000,
units = c("px")
)
## Saving 8400 x 6000 px image
print(scaling_relation_plot)
Less datasets:
datasets_selected = c("gtex:breast.mammary.tissue_female",
"tcga:tcga-brca_female")
# Calculate data from scaling relationship
cols = c("size",
"S",
"type_dataset",
"S_lag1",
"Fn",
"slope",
"intercept",
"adj.r.squared")
scaling_relation_lessdatasets_df = data.frame(matrix(ncol = length(cols),
nrow = 0,
dimnames = list(NULL, cols)))
for (dataset_selected in datasets_selected){
scaling_relation_filt = topology_results_selected_by_size_df %>%
filter(type_dataset == dataset_selected) %>%
select(size, mean, type_dataset) %>%
unique() %>%
rename("S" = "mean") %>%
arrange(size) %>%
mutate(S_lag1 = if_else(size == min(size), 0, lag(S))) %>%
mutate(Fn = ((S - S_lag1) / S_lag1)) %>%
filter((!(is.infinite(Fn))) & (Fn > 0))
lm_summary = summary(lm(log(scaling_relation_filt$Fn)~log(scaling_relation_filt$size)))
slope = coef(lm_summary)[2]
intercept = coef(lm_summary)[1]
adj.r.squared = lm_summary$adj.r.squared
scaling_relation_lessdatasets_df = rbind(scaling_relation_lessdatasets_df,
cbind(scaling_relation_filt,
data.frame(slope = slope,
intercept = intercept,
adj.r.squared = adj.r.squared)))
}
# Process results
scaling_relation_lessdatasets_df = scaling_relation_lessdatasets_df %>%
filter(Fn > 0) %>%
mutate_at(c('adj.r.squared'), ~sprintf("%.2f",.)) %>%
mutate_at(c('adj.r.squared'), as.character) %>%
mutate(type_dataset = replace(type_dataset,
type_dataset == "tcga:tcga-brca_female",
"TCGA: Breast cancer")) %>%
mutate(type_dataset = replace(type_dataset,
type_dataset == "gtex:breast.mammary.tissue_female",
"GTEx: Breast")) %>%
inner_join( (name2color %>%
as.data.frame() %>%
dplyr::rename("rgb_col"=".") %>%
tibble::rownames_to_column("type_dataset")),
by=c("type_dataset") ) %>%
#mutate(geom_text_repel_label=dplyr::if_else(type_dataset == "TCGA: Breast cancer" & size == 220, "Power law", ""))
mutate(geom_text_repel_label =
dplyr::if_else(type_dataset == "GTEx: Breast" & size == 140, "GTEx: Breast",
dplyr::if_else(type_dataset == "TCGA: Breast cancer" & size == 400, "TCGA: Breast cancer", "")))
scaling_relation_lessdatasets_df$r2labels =
ifelse(scaling_relation_lessdatasets_df$type_dataset == "GTEx: Breast",
paste("GTEx Breast: ",
unique((scaling_relation_lessdatasets_df %>%
filter(type_dataset == "GTEx: Breast"))$adj.r.squared),
sep=""),
ifelse(scaling_relation_lessdatasets_df$type_dataset == "TCGA: Breast cancer",
paste("TCGA Breast c.: ",
unique((scaling_relation_lessdatasets_df %>%
filter(type_dataset == "TCGA: Breast cancer"))$adj.r.squared),
sep=""),
""))
# Plot with labels
scaling_relation_plot_lessdatasets = scaling_relation_lessdatasets_df %>%
ggplot(aes(x=log(size), y=log(Fn), col=r2labels)) +
geom_point(alpha=0.9, size=3) +
#geom_abline(aes(intercept=intercept, slope=slope), col="#cc68f7", size=1) +
geom_abline(aes(intercept=intercept, slope=slope, col=r2labels), size=1, key_glyph = "smooth") +
theme_linedraw() +
xlab("log(n)") +
ylab(bquote(bold(log((S[n]-S[n-1])/S[n-1])))) +
scale_color_manual(guide = "legend", values=as.vector(name2color[levels(factor(scaling_relation_lessdatasets_df$type_dataset))])) +
#scale_colour_identity(guide = 'legend', labels=levels(factor(scaling_relation_lessdatasets_df$r2labels))) +
guides(col = guide_legend(title=bquote(bold(.('Power law') ~ R^2)))) +
theme(aspect.ratio = 1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
#legend.position="none") +
legend.position = c(0.64, 0.86),
legend.text = element_text(size = 16),
legend.title = element_text(size=16, face="bold")
#legend.background = element_rect(linewidth=0.2, linetype="solid", colour ="black"))
)
#geom_label_repel(aes(label=geom_text_repel_label),
#col="#cc68f7",
# geom_label_repel(aes(col=rgb_col, label=geom_text_repel_label),
# fill="white",
# box.padding = 0.5, max.overlaps = Inf,
# label.size = 0.75,
# size = 5,
# alpha = 1,
# label.padding=0.25,
# fontface = "bold",
# na.rm=TRUE,
# seed = 1234)
plot_file = paste(plots_dir,
"scaling_relation_pearson_pval_0.05_lessdatasets.png",
sep="/")
ggsave(
plot_file,
plot=scaling_relation_plot_lessdatasets,
dpi = 1200,
#width = 9500,
height = 6000,
units = c("px")
)
## Saving 8400 x 6000 px image
print(scaling_relation_plot_lessdatasets)
The plots of two datasets separated:
datasets_selected <- list("gtex:breast.mammary.tissue_female" = "GTEx: Breast",
"tcga:tcga-brca_female" = "TCGA: Breast cancer")
y_bot_limit <- -6
y_top_limit <- 3.5
x_bot_limit <- 3.5
x_top_limit <- 7.0
x_annotation <- 5.75
y_annotation <- 2.0
scaling_relation_breast_df <- scaling_relation_lessdatasets_df %>%
filter(type_dataset == "GTEx: Breast")
scaling_relation_plot_breast <- scaling_relation_breast_df %>%
ggplot(aes(x=log(size), y=log(Fn), col=rgb_col)) +
geom_point(alpha=0.9, size=3) +
geom_abline(aes(intercept=intercept, slope=slope, col=rgb_col), size=1) +
annotate("text",
x = x_annotation,
y = y_annotation,
label = paste("GTEx:\nBreast\nR² =",
round(as.numeric(unique(scaling_relation_breast_df$adj.r.squared)),
3)),
size = 5) +
theme_linedraw() +
xlab("log(n)") +
ylab(bquote(bold(log((S[n]-S[n-1])/S[n-1])))) +
xlim(x_bot_limit, x_top_limit) +
ylim(y_bot_limit, y_top_limit) +
scale_colour_identity(guide = 'legend', labels=levels(factor(scaling_relation_breast_df$r2labels))) +
guides(col=guide_legend(title=bquote(bold(.('Power law') ~ R^2)))) +
theme(aspect.ratio=2/1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
#axis.title.x = element_blank(),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position = "none")
scaling_relation_brcancer_df <- scaling_relation_lessdatasets_df %>%
filter(type_dataset == "TCGA: Breast cancer")
scaling_relation_plot_brcancer <- scaling_relation_brcancer_df %>%
ggplot(aes(x=log(size), y=log(Fn), col=rgb_col)) +
geom_point(alpha=0.9, size=3) +
geom_abline(aes(intercept=intercept, slope=slope, col=rgb_col), size=1) +
annotate("text",
x = x_annotation,
y = y_annotation,
label = paste("TCGA:\nBreast c.\nR² =",
round(as.numeric(unique(scaling_relation_brcancer_df$adj.r.squared)),
3)),
size = 5) +
theme_linedraw() +
xlab("log(n)") +
ylab(bquote(bold(log((S[n]-S[n-1])/S[n-1])))) +
xlim(x_bot_limit, x_top_limit) +
ylim(y_bot_limit, y_top_limit) +
scale_colour_identity(guide = 'legend', labels=levels(factor(scaling_relation_brcancer_df$r2labels))) +
guides(col=guide_legend(title=bquote(bold(.('Power law') ~ R^2)))) +
theme(aspect.ratio=2/1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
#axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position = "none")
# layout <- "AB"
#
# ((scaling_relation_plot_breast +
# theme(plot.margin = margin(0.6, 0.1, 0.6, 0.6, "cm"))) +
# (scaling_relation_plot_brcancer +
# theme(plot.margin = margin(0.6, 0.6, 0.6, 0.1, "cm")))) +
# xlab("log(n)") +
# plot_layout(design = layout) &
# theme(element_text(face = 'bold', size = 17))
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "gtex:whole.blood", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
datasets_selected = c("gtex:breast.mammary.tissue_female",
"gtex:lung",
"gtex:whole.blood",
"tcga:tcga-brca_female",
"tcga:tcga-luad",
"scipher:scipher.sample.per.patient.baseline")
#model_selected = "Stretched exponential (by optimization)"
model_selected = "Stretched exponential (by linear fit)"
# Process data
stretched_exponential_regression_proc_df = stretched_exponential_regression_df %>% inner_join((topology_results_selected_analytical_df %>% dplyr::select(type_dataset, model, slope, intercept, a, b) %>% unique()), by=c("type_dataset", "model")) %>%
filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.sample.per.patient.baseline", "R. arthritis")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:lung", "GTEx: Lung")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:breast.mammary.tissue_female", "GTEx: Breast")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-luad", "TCGA: Lung cancer")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca_female", "TCGA: Breast cancer")) %>%
inner_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("type_dataset")), by=c("type_dataset") )
regression_plot = stretched_exponential_regression_proc_df %>%
ggplot(aes(x=x, y=y, col=rgb_col)) +
geom_point(alpha=0.5, size=3) +
geom_abline(aes(intercept=intercept, slope=slope, col=rgb_col), size=1) +
theme_linedraw() +
xlab("log(n)") +
ylab("log[ log(L) - log(S) ]") +
#scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
#scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#65F3BF", "#0072B2", "#56B4E9")) +
#scale_color_manual(values = c("#56B4E9", "#E69F00", "#44AA99", "#65F3BF", "#0072B2", "#D55E00")) +
scale_colour_identity(labels = unique(stretched_exponential_regression_proc_df$type_dataset), breaks = unique(stretched_exponential_regression_proc_df$rgb_col), guide = "legend") +
guides(col=guide_legend(title="Dataset")) +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
legend.text = element_text(size = 16),
legend.title=element_text(size=16, face="bold"),
text = element_text(family = "Helvetica"))
plot_file = paste(plots_dir, "regression_pearson_pval_0.05.png", sep="/")
ggsave(
plot_file,
plot=regression_plot,
dpi = 1200,
#width = 9500,
height = 6000,
units = c("px")
)
## Saving 8400 x 6000 px image
print(regression_plot)
Create a table with the results for the figure:
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
datasets_selected = c("gtex:breast.mammary.tissue_female",
"gtex:lung",
"gtex:whole.blood",
"tcga:tcga-brca_female",
"tcga:tcga-luad",
"scipher:scipher.sample.per.patient.baseline")
#model_selected = "Stretched exponential (by optimization)"
model_selected = "Stretched exponential (by linear fit)"
# Show information for scaling relation
scaling_relation_summary_df = scaling_relation_df %>%
dplyr::select(type_dataset, adj.r.squared) %>%
unique() %>%
arrange(factor(type_dataset, levels = datasets_selected)) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.sample.per.patient.baseline", "R. arthritis")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:lung", "GTEx: Lung")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:breast.mammary.tissue_female", "GTEx: Breast")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-luad", "TCGA: Lung cancer")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca_female", "TCGA: Breast cancer")) %>%
rename("R**2" = "adj.r.squared") %>%
mutate_if(is.numeric, ~sprintf("%.2f",.))
print(scaling_relation_summary_df)
## type_dataset R**2
## 1: GTEx: Breast 0.94
## 2: GTEx: Lung 0.65
## 3: GTEx: Whole blood 0.70
## 4: TCGA: Breast cancer 0.69
## 5: TCGA: Lung cancer 0.90
## 6: R. arthritis 0.88
# Show all information for model selected
analytical_model_summary_df <- analytical_model_summary_df %>%
inner_join((results_selected_norm_df %>%
select("type_dataset", "subclassification")),
by = c("type_dataset")) %>%
unique()
analytical_model_summary_df$L_vs_total = abs((analytical_model_summary_df$unnorm_L - analytical_model_summary_df$total_num_edges)) / analytical_model_summary_df$total_num_edges
analytical_model_summary_df$dataset_name = analytical_model_summary_df$type_dataset
analytical_model_summary_df =
analytical_model_summary_df %>%
separate(col = "dataset_name",
into = c("dataset_name", "sex"),
sep = "_",
remove = TRUE) %>%
separate(col = "dataset_name",
into = c("dataset", NULL),
sep = ":",
remove = FALSE) %>%
mutate(sex = if_else(is.na(sex), "", sex))
## Warning: Expected 2 pieces. Additional pieces discarded in 24 rows [20, 21, 22, 42, 43,
## 44, 64, 65, 66, 86, 87, 88, 108, 109, 110, 130, 131, 132, 152, 153, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 128 rows [1, 2, 3, 5, 6,
## 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 19, 23, 24, 25, 27, ...].
## Warning: Expected 1 pieces. Additional pieces discarded in 176 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
analytical_model_summary_df$dataset_name =
ifelse(!(analytical_model_summary_df$subclassification == ""),
paste(analytical_model_summary_df$dataset_name,
analytical_model_summary_df$subclassification,
sep = "-"),
analytical_model_summary_df$dataset_name)
analytical_model_summary_df$dataset_name =
ifelse(!(analytical_model_summary_df$sex == ""),
paste(analytical_model_summary_df$dataset_name,
analytical_model_summary_df$sex,
sep = "_"),
analytical_model_summary_df$dataset_name)
analytical_model_summary_df %>%
dplyr::select(type_dataset, model, a, b, L, L_vs_total, adj.r.squared, relative.error.mean) %>%
filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>% unique() %>%
arrange(factor(type_dataset, levels = datasets_selected, datasets_selected))
## type_dataset
## 1 gtex:breast.mammary.tissue_female
## 2 gtex:lung
## 3 gtex:whole.blood
## 4 tcga:tcga-brca_female
## 5 tcga:tcga-luad
## 6 scipher:scipher.sample.per.patient.baseline
## model a b L L_vs_total
## 1 Stretched exponential (by linear fit) 1.811947 64.89496 3.885985 0
## 2 Stretched exponential (by linear fit) 1.654318 28.07262 1.964261 0
## 3 Stretched exponential (by linear fit) 1.650687 21.33857 1.586486 0
## 4 Stretched exponential (by linear fit) 1.478753 17.41578 3.599061 0
## 5 Stretched exponential (by linear fit) 1.649890 37.43779 2.749527 0
## 6 Stretched exponential (by linear fit) 1.758283 41.83477 2.332343 0
## adj.r.squared relative.error.mean
## 1 0.9963943 0.30863165
## 2 0.9838666 0.28316122
## 3 0.9832295 0.16228112
## 4 0.9974590 0.06618629
## 5 0.9979574 0.08379174
## 6 0.9908833 0.13866336
# Show all information for logarithmic
log_model_summary_df = analytical_model_summary_df %>%
dplyr::select(type_dataset, model, a, b, L, L_vs_total, adj.r.squared, relative.error.mean) %>%
filter((type_dataset %in% datasets_selected) & (model == "Logarithmic")) %>% unique() %>%
dplyr::select(type_dataset, adj.r.squared, relative.error.mean) %>%
rename("R**2" = "adj.r.squared", "epsilon" = "relative.error.mean") %>%
arrange(factor(type_dataset, levels = datasets_selected)) %>%
mutate_if(is.numeric, ~sprintf("%.2f",.))
print(log_model_summary_df)
## type_dataset R**2 epsilon
## 1 gtex:breast.mammary.tissue_female 0.88 8.83
## 2 gtex:lung 0.95 10.44
## 3 gtex:whole.blood 0.99 1.15
## 4 tcga:tcga-brca_female 0.88 7.94
## 5 tcga:tcga-luad 0.89 12.89
## 6 scipher:scipher.sample.per.patient.baseline 0.95 2.93
# Create table for figure
power_law_summary_df = analytical_model_summary_df %>%
dplyr::select(type_dataset, model, a, adj.r.squared, relative.error.mean) %>%
filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>%
dplyr::select(-model) %>%
arrange(factor(type_dataset, levels = datasets_selected)) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.sample.per.patient.baseline", "R. arthritis")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:lung", "GTEx: Lung")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:breast.mammary.tissue_female", "GTEx: Breast")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-luad", "TCGA: Lung cancer")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca_female", "TCGA: Breast cancer")) %>%
rename("alpha"="a", "R**2" = "adj.r.squared", "epsilon" = "relative.error.mean") %>%
unique() %>%
mutate_if(is.numeric, ~sprintf("%.2f",.))
print(power_law_summary_df)
## type_dataset alpha R**2 epsilon
## 1 GTEx: Breast 1.81 1.00 0.31
## 2 GTEx: Lung 1.65 0.98 0.28
## 3 GTEx: Whole blood 1.65 0.98 0.16
## 4 TCGA: Breast cancer 1.48 1.00 0.07
## 5 TCGA: Lung cancer 1.65 1.00 0.08
## 6 R. arthritis 1.76 0.99 0.14
# Bind power law with logarithm results
#figure_summary_table = cbind(power_law_summary_df, (log_model_summary_df %>% select(-type_dataset)))
figure_summary_table = cbind(scaling_relation_summary_df, (power_law_summary_df %>% select(-type_dataset)), (log_model_summary_df %>% select(-type_dataset)))
# Assign first row as row names
rownames(figure_summary_table) <- figure_summary_table$type_dataset
figure_summary_table$type_dataset <- NULL
# Customize and print table
name2colorlight <- c(
"TCGA: Lung cancer" = "#c0e5fa",
"TCGA: Breast cancer" = "#2bb3ff",
"breast neoplasms"="#2bb3ff",
"GTEx: Whole blood" = "#c3f7e4",
"GTEx: Lung" = "#78ffa0",
"GTEx: Breast" = "#47d671",
"R. arthritis" = "#ffd6b5"
)
colors_original_figure = as.vector(name2color[unique(rownames(figure_summary_table))])
colors_original_figure = c(tail(colors_original_figure, 1), head(colors_original_figure, -1)) # I don't know why, the color in fg_params starts from 2nd position, so I have to reorder the vector of colors in order to match the other colors
colors_light_figure = as.vector(name2colorlight[unique(rownames(figure_summary_table))])
core_figure = gridExtra::tableGrob(figure_summary_table, theme=ttheme_minimal(
colhead=list(bg_params = list(fill = NA, col="black", lwd=1),
fg_params = list(parse=TRUE, fontface="bold", fontsize=18)),
rowhead=list(bg_params = list(fill = NA, col=NA),
fg_params=list(col="black", fontface="plain", fontsize=18)),
core=list(bg_params=list(fill=colors_light_figure, col="black", lwd=1), fg_params=list(fontsize=18))))
#header <- gridExtra::tableGrob(figure_summary_table[1, 1:2], rows=NULL, cols=c("Power law", "Logarithm"), theme=ttheme_minimal(
header <- gridExtra::tableGrob(figure_summary_table[1, 1:3], rows=NULL, cols=c("Power law", "Analytical model", "Logarithm"), theme=ttheme_minimal(
colhead=list(bg_params = list(fill = NA, col="black", lwd=1),
fg_params = list(parse=TRUE, fontface="plain", fontsize=18))))
figure_summary_table_plot <- gtable_combine(header[1,], core_figure, along=2)
#figure_summary_table_plot$widths[2:length(figure_summary_table_plot$widths)] <- rep(max(figure_summary_table_plot$widths[2:length(figure_summary_table_plot$widths)]), length(figure_summary_table_plot$widths[2:length(figure_summary_table_plot$widths)])) # make column widths equal
figure_summary_table_plot$widths[3:length(figure_summary_table_plot$widths)] <- rep(max(figure_summary_table_plot$widths[3:length(figure_summary_table_plot$widths)]), length(figure_summary_table_plot$widths[3:length(figure_summary_table_plot$widths)])) # make columns 3 to end widths equal
figure_summary_table_plot$widths[2] <- rep(max(figure_summary_table_plot$widths[2])*0.7, length(figure_summary_table_plot$widths[2])) # make column 2 smaller
#grid.newpage()
#grid.draw(figure_summary_table_plot) # see what it looks like before altering gtable
# change the relevant rows of gtable
grid.newpage()
#figure_summary_table_plot$layout[1:4 , c("l", "r")] <- list(c(2, 7), c(6, 8)) # For 8 columns (including b and L)
#figure_summary_table_plot$layout[1:4 , c("l", "r")] <- list(c(2, 5), c(4, 6))
figure_summary_table_plot$layout[1:6 , c("l", "r")] <- list(c(2, 3, 6), c(2, 5, 7))
grid.draw(figure_summary_table_plot)
All datasets:
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "gtex:whole.blood", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
datasets_selected = c("gtex:breast.mammary.tissue_female",
"gtex:lung",
"gtex:whole.blood",
"tcga:tcga-brca_female",
"tcga:tcga-luad",
"scipher:scipher.sample.per.patient.baseline")
#model_selected = "Stretched exponential (by optimization)"
model_selected = "Stretched exponential (by linear fit)"
# Process data
prediction_plot_df = results_selected_norm_df %>%
dplyr::right_join((predicted_results_norm_df %>% dplyr::select(type_dataset, model, model_result, size) %>% mutate_at(c('model_result'), as.numeric) %>% unique()), by=c("type_dataset", "size")) %>%
dplyr::filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>%
dplyr::select(size, rep, type_dataset, num_edges, model_result) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "scipher:scipher.sample.per.patient.baseline", "R. arthritis")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:whole.blood", "GTEx: Whole blood")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:lung", "GTEx: Lung")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:breast.mammary.tissue_female", "GTEx: Breast")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-luad", "TCGA: Lung cancer")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca_female", "TCGA: Breast cancer")) %>%
inner_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("type_dataset")), by=c("type_dataset") ) %>%
mutate(geom_text_repel_label=dplyr::if_else(type_dataset == "R. arthritis" & size == 240 & rep == 1, "R. arthritis", dplyr::if_else(type_dataset == "GTEx: Whole blood" & size == 700 & rep == 1, "GTEx: Whole blood", dplyr::if_else(type_dataset == "GTEx: Lung" & size == 560 & rep == 1, "GTEx: Lung", dplyr::if_else(type_dataset == "GTEx: Breast" & size == 140 & rep == 1, "GTEx: Breast", dplyr::if_else(type_dataset == "TCGA: Lung cancer" & size == 460 & rep == 1, "TCGA: Lung cancer", dplyr::if_else(type_dataset == "TCGA: Breast cancer" & size == 800 & rep == 1, "TCGA: Breast cancer", "")))))))
# # Plot with legend
# prediction_plot = prediction_plot_df %>%
# ggplot(aes(x=size, y=num_edges, col=type_dataset)) +
# geom_point(alpha=0.5, size=3) +
# #geom_line(data=(predicted_results_norm_df %>% filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>% unique()), aes(x=size, y=model_result, col=type_dataset), size=1) +
# geom_line(aes(x=size, y=model_result, col=type_dataset), size=1) +
# scale_x_continuous(trans = scales::log10_trans()) +
# #scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
# #scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#65F3BF", "#0072B2", "#56B4E9")) +
# #scale_color_manual(values = c("#56B4E9", "#E69F00", "#44AA99", "#65F3BF", "#0072B2", "#D55E00")) +
# scale_color_manual(values = as.vector(name2color[levels(factor(prediction_plot_df$type_dataset))])) + # Plot using the dictionary
# theme_linedraw() +
# xlab("Num. samples (Log)") +
# ylab("Frac. significant correlations") +
# guides(col=guide_legend(title="Dataset")) +
# theme(aspect.ratio=1,
# plot.title = element_text(size = 20, face="bold"),
# axis.title = element_text(size = 17, face="bold"),
# axis.text = element_text(size = 16),
# legend.text = element_text(size = 16),
# legend.title=element_text(size=16, face="bold"))
# Plot with labels
prediction_plot = prediction_plot_df %>%
ggplot(aes(x=log10(size), y=num_edges, col=rgb_col)) +
geom_point(alpha=0.5, size=3) +
#geom_line(data=(predicted_results_norm_df %>% filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>% unique()), aes(x=size, y=model_result, col=type_dataset), size=1) +
geom_line(aes(x=log10(size), y=model_result, col=rgb_col), size=1) +
#scale_x_continuous(trans = scales::log10_trans()) +
#scale_color_manual(values = as.vector(name2color[levels(factor(prediction_plot_df$type_dataset))])) + # Plot using the dictionary
scale_colour_identity() +
theme_linedraw() +
xlab("Num. samples (Log10)") +
ylab("Frac. significant correlations") +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none") +
geom_label_repel(aes(col=rgb_col, label=geom_text_repel_label),
fill="white",
box.padding = 0.5, max.overlaps = Inf,
label.size = 0.75,
size = 5,
alpha = 0.9,
label.padding=0.25,
fontface = "bold",
na.rm=TRUE,
seed = 1234)
plot_file = paste(plots_dir, "prediction_pearson_pval_0.05.png", sep="/")
print(prediction_plot)
## Warning: Removed 29835 rows containing missing values (`geom_point()`).
ggsave(
plot_file,
plot=prediction_plot,
dpi = 1200,
#width = 9500,
#height = 6000,
units = c("px")
)
## Saving 8400 x 6000 px image
## Warning: Removed 29835 rows containing missing values (`geom_point()`).
Less datasets:
datasets_selected = c("gtex:breast.mammary.tissue_female",
"tcga:tcga-brca_female")
model_selected = "Stretched exponential (by linear fit)"
# Process data
prediction_plot_lessdatasets_df = results_selected_norm_df %>%
dplyr::right_join((predicted_results_norm_df %>%
dplyr::select(type_dataset, model, model_result, size) %>%
mutate_at(c('model_result'), as.numeric) %>%
unique()), by=c("type_dataset", "size")) %>%
dplyr::filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>%
dplyr::select(size, rep, type_dataset, num_edges, model_result) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:breast.mammary.tissue_female", "GTEx: Breast")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca_female", "TCGA: Breast c.")) %>%
inner_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("type_dataset")), by=c("type_dataset") ) %>%
mutate(geom_text_repel_label=dplyr::if_else(type_dataset == "R. arthritis" & size == 240 & rep == 1, "R. arthritis", dplyr::if_else(type_dataset == "GTEx: Whole blood" & size == 700 & rep == 1, "GTEx: Whole blood", dplyr::if_else(type_dataset == "GTEx: Lung" & size == 560 & rep == 1, "GTEx: Lung", dplyr::if_else(type_dataset == "GTEx: Breast" & size == 140 & rep == 1, "GTEx: Breast", dplyr::if_else(type_dataset == "TCGA: Lung cancer" & size == 460 & rep == 1, "TCGA: Lung cancer", dplyr::if_else(type_dataset == "TCGA: Breast c." & size == 1000 & rep == 1, "TCGA: Breast c.", "")))))))
# Plot with labels
prediction_plot_lessdatasets = prediction_plot_lessdatasets_df %>%
ggplot(aes(x=log10(size), y=num_edges, col=rgb_col)) +
geom_point(alpha=0.5, size=3) +
#geom_line(data=(predicted_results_norm_df %>% filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>% unique()), aes(x=size, y=model_result, col=type_dataset), size=1) +
geom_line(aes(x=log10(size), y=model_result, col=rgb_col), size=1) +
#scale_x_continuous(trans = scales::log10_trans()) +
#scale_color_manual(values = as.vector(name2color[levels(factor(prediction_plot_lessdatasets_df$type_dataset))])) + # Plot using the dictionary
#scale_colour_identity() +
scale_colour_identity(labels = unique(prediction_plot_lessdatasets_df$type_dataset), breaks = unique(prediction_plot_lessdatasets_df$rgb_col), guide = "legend") +
theme_linedraw() +
xlab("Num. samples (Log10)") +
ylab("Frac. significant correlations") +
guides(col=guide_legend(title = NULL)) +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
legend.position = c(0.28, 0.92),
legend.text = element_text(size = 16),
legend.title = element_text(size = 16, face="bold"),
#legend.position="none"),
legend.background = element_rect(fill = 'transparent', color = "transparent"),
text = element_text(family = "Helvetica")) #+
# geom_label_repel(aes(col=rgb_col, label=geom_text_repel_label),
# fill="white",
# box.padding = 0.5, max.overlaps = Inf,
# label.size = NA,
# size = 5,
# alpha = 0.9,
# label.padding=0.25,
# fontface = "bold",
# na.rm=TRUE,
# seed = 1234)
plot_file = paste(plots_dir, "prediction_pearson_pval_0.05_lessdatasets.png", sep="/")
print(prediction_plot_lessdatasets)
## Warning: Removed 9938 rows containing missing values (`geom_point()`).
ggsave(
plot_file,
plot=prediction_plot_lessdatasets,
dpi = 1200,
#width = 9500,
#height = 6000,
units = c("px")
)
## Saving 8400 x 6000 px image
## Warning: Removed 9938 rows containing missing values (`geom_point()`).
Number of samples for a given % of significant edges:
datasets_selected = c("gtex:breast.mammary.tissue_female",
"gtex:lung",
"gtex:whole.blood",
"tcga:tcga-brca_female",
"tcga:tcga-luad",
"scipher:scipher.sample.per.patient.baseline")
model_selected = "Stretched exponential (by linear fit)"
predicted_results_norm_df %>% dplyr::select(type_dataset, model, model_result, size) %>%
mutate_at(c('model_result'), as.numeric) %>%
filter((type_dataset %in% datasets_selected) & (model == model_selected)) %>%
group_by(type_dataset) %>%
filter((abs(model_result - 0.25) == min(abs(model_result - 0.25))) | (abs(model_result - 0.5) == min(abs(model_result - 0.5))) | (abs(model_result - 0.75) == min(abs(model_result - 0.75))))
## # A tibble: 18 × 4
## # Groups: type_dataset [6]
## type_dataset model model…¹ size
## <chr> <chr> <dbl> <int>
## 1 scipher:scipher.sample.per.patient.baseline Stretched exponent… 0.252 130
## 2 scipher:scipher.sample.per.patient.baseline Stretched exponent… 0.499 320
## 3 scipher:scipher.sample.per.patient.baseline Stretched exponent… 0.749 1020
## 4 gtex:breast.mammary.tissue_female Stretched exponent… 0.255 150
## 5 gtex:breast.mammary.tissue_female Stretched exponent… 0.503 350
## 6 gtex:breast.mammary.tissue_female Stretched exponent… 0.750 1020
## 7 gtex:lung Stretched exponent… 0.250 190
## 8 gtex:lung Stretched exponent… 0.501 550
## 9 gtex:lung Stretched exponent… 0.750 2100
## 10 gtex:whole.blood Stretched exponent… 0.251 130
## 11 gtex:whole.blood Stretched exponent… 0.503 380
## 12 gtex:whole.blood Stretched exponent… 0.750 1450
## 13 tcga:tcga-brca_female Stretched exponent… 0.250 920
## 14 tcga:tcga-brca_female Stretched exponent… 0.500 3910
## 15 tcga:tcga-brca_female Stretched exponent… 0.750 24570
## 16 tcga:tcga-luad Stretched exponent… 0.250 310
## 17 tcga:tcga-luad Stretched exponent… 0.500 900
## 18 tcga:tcga-luad Stretched exponent… 0.750 3480
## # … with abbreviated variable name ¹model_result
Comparison of stretched exponential with logarithm:
models_selected = c("Stretched exponential (by linear fit)", "Logarithmic", "Mean")
#dataset_selected = "gtex:whole.blood"
dataset_selected = "tcga:tcga-brca_female"
comparison_palette = name2color[c(dataset_selected, "Analytical model", "Logarithm")]
# Join predicted results with mean
predicted_results_norm_and_mean_df = rbind((topology_results_selected_by_size_norm_df %>% dplyr::select(type_dataset, size, mean_norm) %>% unique() %>% rename("model_result" = "mean_norm") %>% mutate(model="Mean")), (predicted_results_norm_df %>% dplyr::select(type_dataset, size, model_result, model) %>% unique() %>% mutate_at(c('model_result'), as.numeric))) %>%
filter(model %in% models_selected) %>%
mutate(model = replace(model, model == "Stretched exponential (by linear fit)", "Analytical model")) %>%
mutate(model = replace(model, model == "Logarithmic", "Logarithm"))
predicted_results_norm_and_mean_df$model <- factor(predicted_results_norm_and_mean_df$model, levels = c("Mean", "Analytical model", "Logarithm"))
# Process data
comparison_log_df = results_selected_norm_df %>%
dplyr::select(type_dataset, size, rep, num_edges) %>%
right_join(predicted_results_norm_and_mean_df, by=c("type_dataset", "size")) %>%
filter(type_dataset == dataset_selected) %>%
filter(size <= max((results_selected_norm_df %>% filter(type_dataset == dataset_selected))$size)) %>%
filter(size >= min((results_selected_norm_df %>% filter(type_dataset == dataset_selected))$size)) %>%
inner_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("model") %>% mutate(model = replace(model, model == dataset_selected, "Mean"))), by=c("model") ) %>%
mutate(geom_text_repel_label=dplyr::if_else(model == "Logarithm" & size == 280 & rep == 1, "Logarithm", dplyr::if_else(model == "Analytical model" & size == 760 & rep == 1, "Analytical model", dplyr::if_else(model == "Mean" & size == 560 & rep == 1, "Mean", ""))))
# # Plot with legend
# comparison_plot = comparison_log_df %>%
# ggplot(aes(x=size, y=num_edges, col=model)) +
# geom_point(alpha=0.1, size=3, col=comparison_palette[1]) +
# geom_line(aes(x=size, y=model_result, col=model), size=1) +
# #scale_x_continuous(trans = scales::log10_trans()) +
# scale_color_manual(values = comparison_palette) +
# theme_linedraw() +
# xlab("Num. samples") +
# ylab("Frac. significant correlations") +
# #guides(col=guide_legend(title="Model")) +
# theme(aspect.ratio=1,
# plot.title = element_text(size = 20, face="bold"),
# axis.title = element_text(size = 17, face="bold"),
# axis.text = element_text(size = 16),
# legend.text = element_text(size = 16),
# #legend.title=element_text(size=16, face="bold"),
# legend.title=element_blank(),
# legend.position="bottom")
# Plot with labels
comparison_plot = comparison_log_df %>%
ggplot(aes(x=size, y=num_edges)) +
geom_point(alpha=0.1, size=3, col=comparison_palette[[dataset_selected]]) +
geom_line(aes(x=size, y=model_result, col=rgb_col), size=1) +
scale_colour_identity() +
theme_linedraw() +
xlab("Num. samples") +
ylab("Frac. significant correlations") +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none") +
geom_label_repel(aes(x=size, y=model_result, col=rgb_col, label=geom_text_repel_label),
fill="white",
box.padding = 0.5, max.overlaps = Inf,
label.size = 0.75,
size = 5,
alpha = 1,
label.padding=0.25,
fontface = "bold",
na.rm=TRUE,
seed = 1234)
plot_file = paste(plots_dir, "/comparison_models_pearson_pval_0.05_", dataset_selected, ".png", sep="")
print(comparison_plot)
## Warning: Removed 106 rows containing missing values (`geom_point()`).
ggsave(
plot_file,
plot=comparison_plot,
dpi = 1200,
#width = 6600,
#height = 6700,
units = c("px")
)
## Saving 8400 x 6000 px image
## Warning: Removed 106 rows containing missing values (`geom_point()`).
Comparison of power law with exponential decay:
datasets_selected = c("tcga:tcga-brca_female")
# Calculate data from scaling relationship
cols = c("size",
"S",
"type_dataset",
"S_lag1",
"Fn",
"model",
"model_result",
"slope",
"intercept",
"adj.r.squared")
scaling_relation_comparison_df = data.frame(matrix(ncol = length(cols),
nrow = 0,
dimnames = list(NULL, cols)))
for (dataset_selected in datasets_selected){
scaling_relation_comp = topology_results_selected_by_size_df %>%
filter(type_dataset == dataset_selected) %>%
select(size, mean, type_dataset) %>%
unique() %>%
rename("S" = "mean") %>%
arrange(size) %>%
mutate(S_lag1 = if_else(size == min(size), 0, lag(S))) %>%
mutate(Fn = ((S - S_lag1) / S_lag1)) %>%
filter((!(is.infinite(Fn))) & (Fn > 0))
pl_summary = summary(lm(log(scaling_relation_comp$Fn)~log(scaling_relation_comp$size)))
pl_pred = exp(log(scaling_relation_comp$size) * coef(pl_summary)[2] + coef(pl_summary)[1])
ed_summary = summary(lm(scaling_relation_comp$Fn~scaling_relation_comp$size))
ed_pred = scaling_relation_comp$size * coef(ed_summary)[2] + coef(ed_summary)[1]
scaling_relation_comparison_df = rbind(scaling_relation_comparison_df,
cbind(scaling_relation_comp,
data.frame(model = "Power law",
model_result = pl_pred),
data.frame(slope = coef(pl_summary)[2],
intercept = coef(pl_summary)[1],
adj.r.squared = pl_summary$adj.r.squared)))
scaling_relation_comparison_df = rbind(scaling_relation_comparison_df,
cbind(scaling_relation_comp,
data.frame(model = "Exponential decay",
model_result = ed_pred),
data.frame(slope = coef(ed_summary)[2],
intercept = coef(ed_summary)[1],
adj.r.squared = ed_summary$adj.r.squared)))
}
# Process results
scaling_relation_comparison_df = scaling_relation_comparison_df %>%
filter(Fn > 0) %>%
mutate_at(c('adj.r.squared'), ~sprintf("%.2f",.)) %>%
mutate_at(c('adj.r.squared'), as.character) %>%
mutate(type_dataset = replace(type_dataset,
type_dataset == "tcga:tcga-brca_female",
"TCGA: Breast cancer")) %>%
mutate(type_dataset = replace(type_dataset,
type_dataset == "gtex:breast.mammary.tissue_female",
"GTEx: Breast")) %>%
inner_join( (name2color %>%
as.data.frame() %>%
dplyr::rename("rgb_col"=".") %>%
tibble::rownames_to_column("model")),
by=c("model") )
scaling_relation_comparison_df$r2labels =
ifelse(scaling_relation_comparison_df$model == "Power law",
paste("Power law: ",
unique((scaling_relation_comparison_df %>%
filter(model == "Power law"))$adj.r.squared),
sep=""),
ifelse(scaling_relation_comparison_df$model == "Exponential decay",
paste("Exp. decay: ",
unique((scaling_relation_comparison_df %>%
filter(model == "Exponential decay"))$adj.r.squared),
sep=""),
""))
# Plot with labels
scaling_relation_plot_comparison = scaling_relation_comparison_df %>%
ggplot(aes(x=size, y=Fn)) +
geom_point(alpha=0.9, size=3, col="#0072B2") +
#geom_abline(aes(intercept=intercept, slope=slope, col=model), size=1) +
geom_line(aes(x= size, y=model_result, col=r2labels), size=1) +
theme_linedraw() +
xlab("n") +
ylab(bquote(bold((S[n]-S[n-1])/S[n-1]))) +
scale_color_manual(guide = 'legend', values=as.vector(name2color[levels(factor(scaling_relation_comparison_df$model))])) +
#scale_colour_identity(guide = 'legend', labels=levels(factor(scaling_relation_comparison_df$r2labels))) +
guides(col=guide_legend(title=bquote(bold(R^2)))) +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
#legend.position="none") +
legend.position = c(0.64, 0.85),
legend.text = element_text(size = 16),
legend.title=element_text(size=16, face="bold"),
legend.background = element_rect(fill = 'transparent', color = "transparent"))
#legend.background = element_rect(linewidth=0.2, linetype="solid", colour ="black"))
plot_file = paste(plots_dir, "/comparison_pl_ed_pearson_pval_0.05_", dataset_selected, ".png", sep="")
ggsave(
plot_file,
plot=scaling_relation_plot_comparison,
dpi = 1200,
#width = 9500,
height = 6000,
units = c("px")
)
## Saving 8400 x 6000 px image
print(scaling_relation_plot_comparison)
Demonstration of the model goodness-of-fit for 1 model:
models_selected = c("Stretched exponential (by linear fit)")
#dataset_selected = "gtex:whole.blood"
dataset_selected = "tcga:tcga-brca_female"
comparison_palette = name2color[c(dataset_selected, "Analytical model")]
# Join predicted results with mean
predicted_results_norm_and_mean_df <-
rbind((topology_results_selected_by_size_norm_df %>%
dplyr::select(type_dataset, size, mean_norm) %>%
unique() %>%
rename("model_result" = "mean_norm") %>%
mutate(model="Mean")),
(predicted_results_norm_df %>%
dplyr::select(type_dataset, size, model_result, model) %>%
unique()
%>% mutate_at(c('model_result'), as.numeric))) %>%
filter(model %in% models_selected) %>%
mutate(model = replace(model,
model == "Stretched exponential (by linear fit)",
"Analytical model"))
predicted_results_norm_and_mean_df$model <- factor(predicted_results_norm_and_mean_df$model, levels = c("Mean", "Analytical model", "Logarithm"))
# Process data
goodnessfit_df = results_selected_norm_df %>%
dplyr::select(type_dataset, size, rep, num_edges) %>%
right_join(predicted_results_norm_and_mean_df, by=c("type_dataset", "size")) %>%
filter(type_dataset == dataset_selected) %>%
filter(size <= max((results_selected_norm_df %>% filter(type_dataset == dataset_selected))$size)) %>%
filter(size >= min((results_selected_norm_df %>% filter(type_dataset == dataset_selected))$size)) %>%
inner_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("model") %>% mutate(model = replace(model, model == dataset_selected, "Mean"))), by=c("model") ) %>%
mutate(geom_text_repel_label=dplyr::if_else(model == "Logarithm" & size == 280 & rep == 1, "Logarithm", dplyr::if_else(model == "Analytical model" & size == 760 & rep == 1, "Analytical model", dplyr::if_else(model == "Mean" & size == 560 & rep == 1, "Mean", ""))))
# Plot with labels
goodnessfit_plot = goodnessfit_df %>%
ggplot(aes(x=size, y=num_edges)) +
geom_point(alpha=0.6, size=3, col=comparison_palette[[dataset_selected]]) +
geom_line(aes(x=size, y=model_result, col=rgb_col), size=1) +
scale_colour_identity() +
theme_linedraw() +
xlab("Num. samples") +
ylab("Frac. significant correlations") +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none") +
geom_label_repel(aes(x=size, y=model_result, col=rgb_col, label=geom_text_repel_label),
fill="white",
box.padding = 0.5, max.overlaps = Inf,
label.size = 0.75,
size = 5,
alpha = 1,
label.padding=0.25,
fontface = "bold",
na.rm=TRUE,
seed = 1234)
plot_file = paste(plots_dir, "/goodnessfit_model_pearson_pval_0.05_", dataset_selected, ".png", sep="")
print(goodnessfit_plot)
## Warning: Removed 53 rows containing missing values (`geom_point()`).
ggsave(
plot_file,
plot=goodnessfit_plot,
dpi = 1200,
#width = 6600,
#height = 6700,
units = c("px")
)
## Saving 8400 x 6000 px image
## Warning: Removed 53 rows containing missing values (`geom_point()`).
Demonstration of the model goodness-of-fit for 2 models:
datasets_selected = c("gtex:breast.mammary.tissue_female",
"tcga:tcga-brca_female")
models_selected = c("Stretched exponential (by linear fit)")
# Join predicted results with mean
#predicted_results_norm_and_mean_df <-
predicted_results_mean_df <-
#rbind((topology_results_selected_by_size_norm_df %>%
rbind((topology_results_selected_by_size_df %>%
#dplyr::select("type_dataset", "size", "mean_norm") %>%
dplyr::select("type_dataset", "size", "mean") %>%
unique() %>%
#rename("model_result" = "mean_norm") %>%
rename("model_result" = "mean") %>%
mutate(model="Mean")),
#(predicted_results_norm_df %>%
(predicted_results_df %>%
dplyr::select(type_dataset, size, model_result, model) %>%
unique()
%>% mutate_at(c('model_result'), as.numeric))) %>%
filter(type_dataset %in% datasets_selected) %>%
filter(model %in% models_selected) %>%
inner_join((analytical_model_summary_df %>%
dplyr::select("type_dataset", "model", "adj.r.squared")),
by = c("type_dataset", "model")) %>%
mutate_at(c('adj.r.squared'), ~sprintf("%.2f",.)) %>%
mutate_at(c('adj.r.squared'), as.character)
# Process data
#goodnessfit_lessdatasets_df = results_selected_norm_df %>%
goodnessfit_lessdatasets_df = results_selected_df %>%
dplyr::select(type_dataset, size, rep, num_edges) %>%
#right_join(predicted_results_norm_and_mean_df, by=c("type_dataset", "size")) %>%
right_join(predicted_results_mean_df, by=c("type_dataset", "size")) %>%
filter(type_dataset %in% datasets_selected) %>%
#filter(size <= max((results_selected_norm_df %>%
filter(size <= max((results_selected_df %>%
filter(type_dataset %in% datasets_selected))$size)) %>%
#filter(size >= min((results_selected_norm_df %>%
filter(size >= min((results_selected_df %>%
filter(type_dataset %in% datasets_selected))$size)) %>%
filter(!(is.na(num_edges))) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "gtex:breast.mammary.tissue_female", "GTEx: Breast")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "tcga:tcga-brca_female", "TCGA: Breast cancer")) %>%
inner_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("type_dataset")), by=c("type_dataset") ) %>%
mutate(geom_text_repel_label=dplyr::if_else(type_dataset == "R. arthritis" & size == 240 & rep == 1, "R. arthritis", dplyr::if_else(type_dataset == "GTEx: Whole blood" & size == 700 & rep == 1, "GTEx: Whole blood", dplyr::if_else(type_dataset == "GTEx: Lung" & size == 560 & rep == 1, "GTEx: Lung", dplyr::if_else(type_dataset == "GTEx: Breast" & size == 140 & rep == 1, "GTEx: Breast", dplyr::if_else(type_dataset == "TCGA: Lung cancer" & size == 460 & rep == 1, "TCGA: Lung cancer", dplyr::if_else(type_dataset == "TCGA: Breast cancer" & size == 1000 & rep == 1, "TCGA: Breast cancer", "")))))))
goodnessfit_lessdatasets_df$r2labels =
ifelse(goodnessfit_lessdatasets_df$type_dataset == "GTEx: Breast",
paste("GTEx Breast: ",
unique((goodnessfit_lessdatasets_df %>%
filter(type_dataset == "GTEx: Breast"))$adj.r.squared),
sep=""),
ifelse(goodnessfit_lessdatasets_df$type_dataset == "TCGA: Breast cancer",
paste("TCGA Breast c.: ",
unique((goodnessfit_lessdatasets_df %>%
filter(type_dataset == "TCGA: Breast cancer"))$adj.r.squared),
sep=""),
""))
# Plot with labels
goodnessfit_plot_lessdatasets = goodnessfit_lessdatasets_df %>%
ggplot(aes(x=size, y=num_edges, col=r2labels)) +
geom_point(alpha=0.6, size=3) +
geom_line(aes(x=size, y=model_result, col=r2labels), size=1) +
#scale_colour_identity() +
scale_color_manual(guide = 'legend', values=as.vector(name2color[levels(factor(goodnessfit_lessdatasets_df$type_dataset))])) +
theme_linedraw() +
xlab("Num. samples") +
ylab("Frac. significant correlations") +
guides(col=guide_legend(title=bquote(bold(.('Analytical model') ~ R^2)))) +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
#legend.position="none") +
legend.position = c(0.64, 0.15),
legend.text = element_text(size = 16),
legend.title=element_text(size=16, face="bold"),
#legend.background = element_rect(linewidth=0.2, linetype="solid", colour ="black")
legend.background = element_rect(fill = "transparent", color = "transparent")
)
# geom_label_repel(aes(x=size, y=model_result, col=rgb_col, label=geom_text_repel_label),
# fill="white",
# box.padding = 0.5, max.overlaps = Inf,
# label.size = 0.75,
# size = 5,
# alpha = 1,
# label.padding=0.25,
# fontface = "bold",
# na.rm=TRUE,
# seed = 1234)
plot_file = paste(plots_dir, "/goodnessfit_2models_pearson_pval_0.05_", dataset_selected, ".png", sep="")
print(goodnessfit_plot_lessdatasets)
ggsave(
plot_file,
plot = goodnessfit_plot_lessdatasets,
dpi = 1200,
#width = 6600,
#height = 6700,
units = c("px")
)
## Saving 8400 x 6000 px image
Define functions:
#' calculate_predictions_using_stretched_exponential_model_optimized
#' Formula to calculate exponential decay of significant interactions from a list of sample sizes.
#' This formula requires the use of the L parameter
#' @param x List of sample sizes.
#' @param a Slope coefficient.
#' @param b Intercept coefficient.
#'
calculate_predictions_using_stretched_exponential_model_optimized = function(x, L, a, b){
y = L * exp((b * x ** (-a + 1)) / (-a + 1))
#y = exp(log(L) - exp(b+a*log(x)))
return(y)
}
#' calculate_prediction_from_analytical_model
#' Function to calculate predictions using a specific analytical model
#' @param model Name of the model used.
#' @param x_list List of values in the x axis (e.g. size)
#' @param a Parameter a.
#' @param b Parameter b.
#' @param L Parameter L.
#'
calculate_prediction_from_analytical_model = function(model, x_list, a, b, L){
if(model == "Logarithmic"){
prediction_result = (log(x_list) *a + b)
} else if((model == "Stretched exponential (by optimization)") | (model == "Stretched exponential (by linear fit)")){
prediction_result = calculate_predictions_using_stretched_exponential_model_optimized(x=x_list, L=L, a=a, b=b)
} else if(model == "Stretched exponential (without L)"){
prediction_result = calculate_predictions_using_stretched_exponential_model_without_L(x=x_list, a=a, b=b)
}
return(prediction_result)
}
Calculate predictions of the models:
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca")
#datasets_selected = c("gtex:whole.blood", "gtex:artery.tibial", "tcga:tcga-coad", "tcga:tcga-brca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
#datasets_selected = c("gtex:breast.mammary.tissue", "gtex:thyroid", "gtex:whole.blood", "tcga:tcga-brca", "tcga:tcga-thca", "scipher:scipher.complete.dataset")
datasets_selected = c("gtex:breast.mammary.tissue_female",
"gtex:lung",
"gtex:whole.blood",
"tcga:tcga-brca_female",
"tcga:tcga-luad",
"scipher:scipher.sample.per.patient.baseline")
#model_selected = "Stretched exponential (by optimization)"
model_selected = "Stretched exponential (by linear fit)"
sample_size_vs_a_df = analytical_model_summary_df %>%
inner_join(sample_size_correlation_df,
by = c("type_dataset" = "dataset")) %>%
#filter((type_dataset %in% datasets_selected) & (model == model_selected))
filter(model == model_selected)
# Calculate number of edges for each critical correlation using model
sample_size_vs_a_df$num_edges_from_statistical_corrected =
calculate_prediction_from_analytical_model(model = model_selected,
x_list = sample_size_vs_a_df$sample_size_statistical_corrected,
a = sample_size_vs_a_df$a,
b = sample_size_vs_a_df$b,
L = sample_size_vs_a_df$L)
sample_size_vs_a_df$num_edges_from_statistical_corrected =
sample_size_vs_a_df$num_edges_from_statistical_corrected * sample_size_vs_a_df$max_value_in_dataset
sample_size_vs_a_df$num_edges_from_statistical_corrected_norm =
sample_size_vs_a_df$num_edges_from_statistical_corrected / sample_size_vs_a_df$total_num_edges
# sample_size_vs_a_df = sample_size_vs_a_df %>%
# tidyr::separate(type_dataset,
# into = c("dataset", NA),
# sep = ":",
# remove = FALSE) %>%
# tidyr::separate(type_dataset,
# into = c("dataset_name", "sex"),
# sep = "_",
# remove = TRUE) %>%
# mutate(subclassification =
# if_else(dataset == "tcga",
# "tumor",
# "")) %>%
# mutate(dataset_name =
# if_else(dataset == "tcga",
# paste(dataset_name,
# subclassification,
# sep = "-"),
# dataset_name)) %>%
# mutate(dataset_name =
# if_else(!(is.na(sex)),
# paste(dataset_name,
# sex,
# sep = "_"),
# dataset_name))
sample_size_vs_a_df %>% head(10)
## type_dataset
## 1 scipher:scipher.complete.dataset
## 2 scipher:scipher.complete.dataset
## 3 scipher:scipher.complete.dataset
## 4 scipher:scipher.complete.dataset
## 5 scipher:scipher.sample.per.patient.baseline
## 6 scipher:scipher.sample.per.patient.baseline
## 7 scipher:scipher.sample.per.patient.baseline
## 8 scipher:scipher.sample.per.patient.baseline
## 9 gtex:breast.mammary.tissue
## 10 gtex:breast.mammary.tissue
## model a b L
## 1 Stretched exponential (by linear fit) 1.790429 42.98998 1.462534
## 2 Stretched exponential (by linear fit) 1.790429 42.98998 1.462534
## 3 Stretched exponential (by linear fit) 1.790429 42.98998 1.462534
## 4 Stretched exponential (by linear fit) 1.790429 42.98998 1.462534
## 5 Stretched exponential (by linear fit) 1.758283 41.83477 2.332343
## 6 Stretched exponential (by linear fit) 1.758283 41.83477 2.332343
## 7 Stretched exponential (by linear fit) 1.758283 41.83477 2.332343
## 8 Stretched exponential (by linear fit) 1.758283 41.83477 2.332343
## 9 Stretched exponential (by linear fit) 1.720270 43.15181 2.114016
## 10 Stretched exponential (by linear fit) 1.720270 43.15181 2.114016
## max_value_in_dataset
## 1 65309202
## 2 65309202
## 3 65309202
## 4 65309202
## 5 40568917
## 6 40568917
## 7 40568917
## 8 40568917
## 9 74740207
## 10 74740207
## formula adj.r.squared
## 1 F(x) = 1.46 * exp[(42.99 * x**((1.79) + 1)) / ((1.79) + 1) ] 0.9868786
## 2 F(x) = 1.46 * exp[(42.99 * x**((1.79) + 1)) / ((1.79) + 1) ] 0.9868786
## 3 F(x) = 1.46 * exp[(42.99 * x**((1.79) + 1)) / ((1.79) + 1) ] 0.9868786
## 4 F(x) = 1.46 * exp[(42.99 * x**((1.79) + 1)) / ((1.79) + 1) ] 0.9868786
## 5 F(x) = 2.33 * exp[(41.83 * x**((1.76) + 1)) / ((1.76) + 1) ] 0.9908833
## 6 F(x) = 2.33 * exp[(41.83 * x**((1.76) + 1)) / ((1.76) + 1) ] 0.9908833
## 7 F(x) = 2.33 * exp[(41.83 * x**((1.76) + 1)) / ((1.76) + 1) ] 0.9908833
## 8 F(x) = 2.33 * exp[(41.83 * x**((1.76) + 1)) / ((1.76) + 1) ] 0.9908833
## 9 F(x) = 2.11 * exp[(43.15 * x**((1.72) + 1)) / ((1.72) + 1) ] 0.9871514
## 10 F(x) = 2.11 * exp[(43.15 * x**((1.72) + 1)) / ((1.72) + 1) ] 0.9871514
## relative.error.mean unnorm_L total_num_edges density subclassification
## 1 0.1392092 95516931 95516931 1
## 2 0.1392092 95516931 95516931 1
## 3 0.1392092 95516931 95516931 1
## 4 0.1392092 95516931 95516931 1
## 5 0.1386634 94620646 94620646 1
## 6 0.1386634 94620646 94620646 1
## 7 0.1386634 94620646 94620646 1
## 8 0.1386634 94620646 94620646 1
## 9 0.1889067 158001976 158001976 1
## 10 0.1889067 158001976 158001976 1
## L_vs_total dataset_name dataset sex
## 1 0 scipher:scipher.complete.dataset scipher
## 2 0 scipher:scipher.complete.dataset scipher
## 3 0 scipher:scipher.complete.dataset scipher
## 4 0 scipher:scipher.complete.dataset scipher
## 5 0 scipher:scipher.sample.per.patient.baseline scipher
## 6 0 scipher:scipher.sample.per.patient.baseline scipher
## 7 0 scipher:scipher.sample.per.patient.baseline scipher
## 8 0 scipher:scipher.sample.per.patient.baseline scipher
## 9 0 gtex:breast.mammary.tissue gtex
## 10 0 gtex:breast.mammary.tissue gtex
## type_correlation correlation sample_size_statistical
## 1 weak 0.2 68.773025
## 2 moderate 0.4 18.002295
## 3 strong 0.6 8.523611
## 4 very strong 0.8 5.063346
## 5 weak 0.2 68.773025
## 6 moderate 0.4 18.002295
## 7 strong 0.6 8.523611
## 8 very strong 0.8 5.063346
## 9 weak 0.2 68.773025
## 10 moderate 0.4 18.002295
## sample_size_statistical_corrected num_edges_from_statistical_corrected
## 1 914.63989 74520763
## 2 216.05522 43934688
## 3 85.91378 19096937
## 4 38.90078 4702026
## 5 914.18952 69148495
## 6 215.94977 37083170
## 7 85.87258 14367043
## 8 38.88278 3042116
## 9 938.69085 102474141
## 10 221.68638 46440188
## num_edges_from_statistical_corrected_norm
## 1 0.78018380
## 2 0.45996754
## 3 0.19993248
## 4 0.04922715
## 5 0.73079712
## 6 0.39191415
## 7 0.15183835
## 8 0.03215065
## 9 0.64856240
## 10 0.29392157
Save the predictions:
type_correlation_selected = "weak"
sample_size_vs_a_df %>%
filter(type_correlation == type_correlation_selected) %>%
arrange(desc(a))
## type_dataset
## 1 gse193677:rectum_uc_inflamed
## 2 tcga:lung
## 3 gse193677:rectum_control_noninflamed
## 4 gse193677:rectum_cd_inflamed
## 5 gtex:skin.sun.exposed.lower.leg
## 6 tcga:breast_female
## 7 gtex:breast.mammary.tissue_female
## 8 scipher:scipher.complete.dataset
## 9 tcga:kidney
## 10 scipher:scipher.sample.per.patient.baseline
## 11 gtex:thyroid
## 12 gtex:breast.mammary.tissue
## 13 tcga:tcga-kirp
## 14 gtex:lung
## 15 gtex:whole.blood
## 16 tcga:tcga-luad
## 17 tcga:tcga-thca
## 18 tcga:tcga-kirc
## 19 tcga:tcga-lusc
## 20 tcga:brca.luma
## 21 tcga:tcga-brca_female
## 22 tcga:brca.lumb
## model a b L
## 1 Stretched exponential (by linear fit) 2.198803 214.47311 1.902216
## 2 Stretched exponential (by linear fit) 2.183346 177.44983 2.056570
## 3 Stretched exponential (by linear fit) 1.942484 96.46064 1.913428
## 4 Stretched exponential (by linear fit) 1.898637 79.73148 4.555803
## 5 Stretched exponential (by linear fit) 1.836919 69.54141 1.443081
## 6 Stretched exponential (by linear fit) 1.836527 71.30603 5.974945
## 7 Stretched exponential (by linear fit) 1.811947 64.89496 3.885985
## 8 Stretched exponential (by linear fit) 1.790429 42.98998 1.462534
## 9 Stretched exponential (by linear fit) 1.771018 52.59051 5.738395
## 10 Stretched exponential (by linear fit) 1.758283 41.83477 2.332343
## 11 Stretched exponential (by linear fit) 1.746114 46.00874 1.669286
## 12 Stretched exponential (by linear fit) 1.720270 43.15181 2.114016
## 13 Stretched exponential (by linear fit) 1.668704 38.50169 3.923615
## 14 Stretched exponential (by linear fit) 1.654318 28.07262 1.964261
## 15 Stretched exponential (by linear fit) 1.650687 21.33857 1.586486
## 16 Stretched exponential (by linear fit) 1.649890 37.43779 2.749527
## 17 Stretched exponential (by linear fit) 1.610579 27.46556 2.737581
## 18 Stretched exponential (by linear fit) 1.565623 23.34566 3.378756
## 19 Stretched exponential (by linear fit) 1.549679 25.93423 4.415126
## 20 Stretched exponential (by linear fit) 1.528906 21.43228 4.180236
## 21 Stretched exponential (by linear fit) 1.478753 17.41578 3.599061
## 22 Stretched exponential (by linear fit) 1.441041 14.12650 19.001680
## max_value_in_dataset
## 1 87033411
## 2 89070336
## 3 86523419
## 4 36339651
## 5 109822205
## 6 32891554
## 7 42299605
## 8 65309202
## 9 32872637
## 10 40568917
## 11 95645379
## 12 74740207
## 13 43843097
## 14 82744314
## 15 72312862
## 16 67033504
## 17 64735459
## 18 55176637
## 19 43223551
## 20 44003689
## 21 50774187
## 22 9515666
## formula adj.r.squared
## 1 F(x) = 1.9 * exp[(214.47 * x**((2.2) + 1)) / ((2.2) + 1) ] 0.9735496
## 2 F(x) = 2.06 * exp[(177.45 * x**((2.18) + 1)) / ((2.18) + 1) ] 0.9824779
## 3 F(x) = 1.91 * exp[(96.46 * x**((1.94) + 1)) / ((1.94) + 1) ] 0.9962786
## 4 F(x) = 4.56 * exp[(79.73 * x**((1.9) + 1)) / ((1.9) + 1) ] 0.9791491
## 5 F(x) = 1.44 * exp[(69.54 * x**((1.84) + 1)) / ((1.84) + 1) ] 0.9921006
## 6 F(x) = 5.97 * exp[(71.31 * x**((1.84) + 1)) / ((1.84) + 1) ] 0.9939891
## 7 F(x) = 3.89 * exp[(64.89 * x**((1.81) + 1)) / ((1.81) + 1) ] 0.9963943
## 8 F(x) = 1.46 * exp[(42.99 * x**((1.79) + 1)) / ((1.79) + 1) ] 0.9868786
## 9 F(x) = 5.74 * exp[(52.59 * x**((1.77) + 1)) / ((1.77) + 1) ] 0.9917808
## 10 F(x) = 2.33 * exp[(41.83 * x**((1.76) + 1)) / ((1.76) + 1) ] 0.9908833
## 11 F(x) = 1.67 * exp[(46.01 * x**((1.75) + 1)) / ((1.75) + 1) ] 0.9898823
## 12 F(x) = 2.11 * exp[(43.15 * x**((1.72) + 1)) / ((1.72) + 1) ] 0.9871514
## 13 F(x) = 3.92 * exp[(38.5 * x**((1.67) + 1)) / ((1.67) + 1) ] 0.9948093
## 14 F(x) = 1.96 * exp[(28.07 * x**((1.65) + 1)) / ((1.65) + 1) ] 0.9838666
## 15 F(x) = 1.59 * exp[(21.34 * x**((1.65) + 1)) / ((1.65) + 1) ] 0.9832295
## 16 F(x) = 2.75 * exp[(37.44 * x**((1.65) + 1)) / ((1.65) + 1) ] 0.9979574
## 17 F(x) = 2.74 * exp[(27.47 * x**((1.61) + 1)) / ((1.61) + 1) ] 0.9858457
## 18 F(x) = 3.38 * exp[(23.35 * x**((1.57) + 1)) / ((1.57) + 1) ] 0.9956269
## 19 F(x) = 4.42 * exp[(25.93 * x**((1.55) + 1)) / ((1.55) + 1) ] 0.9853471
## 20 F(x) = 4.18 * exp[(21.43 * x**((1.53) + 1)) / ((1.53) + 1) ] 0.9977903
## 21 F(x) = 3.6 * exp[(17.42 * x**((1.48) + 1)) / ((1.48) + 1) ] 0.9974590
## 22 F(x) = 19 * exp[(14.13 * x**((1.44) + 1)) / ((1.44) + 1) ] 0.9857087
## relative.error.mean unnorm_L total_num_edges density
## 1 0.52545708 165556306 165556306 1
## 2 0.25787164 183179370 183179370 1
## 3 0.10243415 165556306 165556306 1
## 4 0.23290398 165556306 165556306 1
## 5 0.13088726 158482306 158482306 1
## 6 0.10866531 196525225 196525225 1
## 7 0.30863165 164375646 164375646 1
## 8 0.13920918 95516931 95516931 1
## 9 0.17617336 188636176 188636176 1
## 10 0.13866336 94620646 94620646 1
## 11 0.14868418 159659515 159659515 1
## 12 0.18890669 158001976 158001976 1
## 13 0.16316561 172023426 172023426 1
## 14 0.28316122 162531435 162531435 1
## 15 0.16228112 114723378 114723378 1
## 16 0.08379174 184310400 184310400 1
## 17 0.16020709 177218551 177218551 1
## 18 0.08005273 186428395 186428395 1
## 19 0.25501658 190837416 190837416 1
## 20 0.06539910 183945790 183945790 1
## 21 0.06618629 182739403 182739403 1
## 22 0.20213896 180813636 180813636 1
## subclassification L_vs_total
## 1 disease_inflamed_vs_control 0
## 2 tissue 0
## 3 disease_inflamed_vs_control 0
## 4 disease_inflamed_vs_control 0
## 5 0
## 6 tissue 0
## 7 0
## 8 0
## 9 tissue 0
## 10 0
## 11 0
## 12 0
## 13 tumor 0
## 14 0
## 15 0
## 16 tumor 0
## 17 tumor 0
## 18 tumor 0
## 19 tumor 0
## 20 subtype 0
## 21 tumor 0
## 22 subtype 0
## dataset_name dataset sex
## 1 gse193677:rectum-disease_inflamed_vs_control_uc gse193677 uc
## 2 tcga:lung-tissue tcga
## 3 gse193677:rectum-disease_inflamed_vs_control_control gse193677 control
## 4 gse193677:rectum-disease_inflamed_vs_control_cd gse193677 cd
## 5 gtex:skin.sun.exposed.lower.leg gtex
## 6 tcga:breast-tissue_female tcga female
## 7 gtex:breast.mammary.tissue_female gtex female
## 8 scipher:scipher.complete.dataset scipher
## 9 tcga:kidney-tissue tcga
## 10 scipher:scipher.sample.per.patient.baseline scipher
## 11 gtex:thyroid gtex
## 12 gtex:breast.mammary.tissue gtex
## 13 tcga:tcga-kirp-tumor tcga
## 14 gtex:lung gtex
## 15 gtex:whole.blood gtex
## 16 tcga:tcga-luad-tumor tcga
## 17 tcga:tcga-thca-tumor tcga
## 18 tcga:tcga-kirc-tumor tcga
## 19 tcga:tcga-lusc-tumor tcga
## 20 tcga:brca.luma-subtype tcga
## 21 tcga:tcga-brca-tumor_female tcga female
## 22 tcga:brca.lumb-subtype tcga
## type_correlation correlation sample_size_statistical
## 1 weak 0.2 68.77302
## 2 weak 0.2 68.77302
## 3 weak 0.2 68.77302
## 4 weak 0.2 68.77302
## 5 weak 0.2 68.77302
## 6 weak 0.2 68.77302
## 7 weak 0.2 68.77302
## 8 weak 0.2 68.77302
## 9 weak 0.2 68.77302
## 10 weak 0.2 68.77302
## 11 weak 0.2 68.77302
## 12 weak 0.2 68.77302
## 13 weak 0.2 68.77302
## 14 weak 0.2 68.77302
## 15 weak 0.2 68.77302
## 16 weak 0.2 68.77302
## 17 weak 0.2 68.77302
## 18 weak 0.2 68.77302
## 19 weak 0.2 68.77302
## 20 weak 0.2 68.77302
## 21 weak 0.2 68.77302
## 22 weak 0.2 68.77302
## sample_size_statistical_corrected num_edges_from_statistical_corrected
## 1 940.9234 157679996
## 2 945.7592 175094031
## 3 940.9234 140899038
## 4 940.9234 137078882
## 5 938.8359 120953460
## 6 949.1215 149208667
## 7 940.5813 120811364
## 8 914.6399 74520763
## 9 947.1626 133474106
## 10 914.1895 69148495
## 11 939.1897 109922481
## 12 938.6909 102474141
## 13 942.7552 95305292
## 14 940.0419 99917128
## 15 923.3936 78008662
## 16 946.0535 94258238
## 17 944.1776 89214478
## 18 946.5998 79233906
## 19 947.7173 64143529
## 20 945.9588 62419293
## 21 945.6442 46512537
## 22 945.1377 37979699
## num_edges_from_statistical_corrected_norm
## 1 0.9524252
## 2 0.9558611
## 3 0.8510642
## 4 0.8279895
## 5 0.7631985
## 6 0.7592342
## 7 0.7349712
## 8 0.7801838
## 9 0.7075743
## 10 0.7307971
## 11 0.6884806
## 12 0.6485624
## 13 0.5540251
## 14 0.6147557
## 15 0.6799718
## 16 0.5114103
## 17 0.5034150
## 18 0.4250099
## 19 0.3361161
## 20 0.3393353
## 21 0.2545293
## 22 0.2100489
a_vs_fraction_corr_file = paste(tables_dir,
"a_vs_fraction_sig_correlations_pearson_pval_0.05.txt",
sep = "/")
sample_size_vs_a_df %>%
arrange(desc(a)) %>%
fwrite(a_vs_fraction_corr_file)
Plot alpha vs. fraction of correlations above 0.2:
# Process data
datasets_selected <- c("scipher:scipher.sample.per.patient.baseline",
"gtex:breast.mammary.tissue_female",
"gtex:lung",
"gtex:skin.sun.exposed.lower.leg",
"gtex:thyroid",
"gtex:whole.blood",
"tcga:tcga-brca-tumor_female",
"tcga:tcga-kirc-tumor",
"tcga:tcga-kirp-tumor",
"tcga:tcga-luad-tumor",
"tcga:tcga-lusc-tumor",
"tcga:tcga-thca-tumor",
"tcga:breast-tissue_female",
"tcga:kidney-tissue",
"tcga:lung-tissue",
"tcga:brca.luma-subtype",
"tcga:brca.lumb-subtype")
sample_size_vs_a_plot_df = sample_size_vs_a_df %>%
filter(type_correlation == type_correlation_selected) %>%
filter(dataset_name %in% datasets_selected) %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "scipher:scipher.sample.per.patient.baseline",
"R. arthritis")) %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "gtex:whole.blood",
"GTEx: Whole blood")) %>%
#mutate(dataset_name = replace(dataset_name, dataset_name == "gtex:thyroid", "GTEx: Thyroid")) %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "gtex:lung",
"GTEx: Lung")) %>%
#mutate(dataset_name = replace(dataset_name, dataset_name == "gtex:artery.tibial", "GTEx: Artery tibial")) %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "gtex:breast.mammary.tissue_female",
"GTEx: Breast")) %>%
#mutate(dataset_name = replace(dataset_name, dataset_name == "tcga:tcga-coad", "TCGA: Colon cancer")) %>%
#mutate(dataset_name = replace(dataset_name, dataset_name == "tcga:tcga-thca", "TCGA: Thyroid cancer")) %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "tcga:tcga-luad-tumor",
"TCGA: Lung cancer")) %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "tcga:tcga-brca-tumor_female",
"TCGA: Breast cancer")) %>%
left_join( (name2color %>%
as.data.frame() %>%
dplyr::rename("rgb_col" = ".") %>%
tibble::rownames_to_column("dataset_name")),
by = c("dataset_name") ) %>%
mutate(rgb_col = replace(rgb_col, is.na(rgb_col), "#000000")) %>% # Non-selected datasets with color black
mutate(geom_text_repel_label=dplyr::if_else(dataset_name == "R. arthritis", "R. arthritis",
dplyr::if_else(dataset_name == "GTEx: Whole blood", "GTEx: Whole blood",
dplyr::if_else(dataset_name == "GTEx: Lung", "GTEx: Lung",
dplyr::if_else(dataset_name == "GTEx: Breast", "GTEx: Breast",
dplyr::if_else(dataset_name == "TCGA: Lung cancer", "TCGA: Lung cancer",
dplyr::if_else(dataset_name == "TCGA: Breast cancer", "TCGA: Breast cancer", "")))))))
# Calculate correlation
ss_vs_a_cor <- cor(sample_size_vs_a_plot_df$a, sample_size_vs_a_plot_df$num_edges_from_statistical_corrected_norm)
# Calculate regression
lm_summary <- summary(lm(sample_size_vs_a_plot_df$num_edges_from_statistical_corrected_norm ~ sample_size_vs_a_plot_df$a))
slope <- coef(lm_summary)[2]
intercept <- coef(lm_summary)[1]
adj.r.squared <- lm_summary$adj.r.squared
lm_cor_result <- paste("R² =",
round(adj.r.squared, 3),
"\ncorr. =",
round(ss_vs_a_cor, 3))
# Make scatter plot
a_vs_fraction_sig_correlations_plot <- sample_size_vs_a_plot_df %>%
ggplot(aes(x=a, y=num_edges_from_statistical_corrected_norm)) +
geom_point(aes(col=rgb_col), show.legend = FALSE) +
scale_colour_identity() +
geom_label_repel(aes(col = rgb_col, label = geom_text_repel_label),
fill="white",
box.padding = 0.5, max.overlaps = Inf,
label.size = 0.75,
size = 5,
alpha = 0.6,
label.padding=0.25,
fontface = "bold",
na.rm = TRUE,
seed = 5555) +
new_scale_color() +
geom_abline(aes(slope = slope,
intercept = intercept,
col = lm_cor_result),
linetype = "dashed",
show.legend = FALSE) +
scale_color_manual(values = c("gray50")) +
annotate("text", x = 2.07, y = 0.77, label = lm_cor_result, size = 5) +
xlab(expression(alpha)) +
ylab("Frac. correlations above 0.2") +
theme_linedraw() +
theme(aspect.ratio = 1,
plot.title = element_text(size = 20, face = "bold"),
axis.title = element_text(size = 17, face = "bold"),
axis.text = element_text(size = 16),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none")
#legend.position = c(0.70, 0.25),
#legend.text = element_text(size = 14),
#legend.title=element_blank())
print(a_vs_fraction_sig_correlations_plot)
plot_file <- paste(plots_dir,
"a_vs_fraction_sig_correlations_pearson_pval_0.05.png",
sep="/")
ggsave(
plot_file,
plot=a_vs_fraction_sig_correlations_plot,
dpi = 1200,
#width = 6400,
#height = 6000,
units = c("px")
)
## Saving 8400 x 6000 px image
Less datasets:
# Process data
datasets_selected <- c("scipher:scipher.sample.per.patient.baseline",
"gtex:breast.mammary.tissue_female",
"gtex:lung",
"gtex:skin.sun.exposed.lower.leg",
"gtex:thyroid",
"gtex:whole.blood",
"tcga:tcga-brca-tumor_female",
"tcga:tcga-kirc-tumor",
"tcga:tcga-kirp-tumor",
"tcga:tcga-luad-tumor",
"tcga:tcga-lusc-tumor",
"tcga:tcga-thca-tumor",
"tcga:breast-tissue_female",
"tcga:kidney-tissue",
"tcga:lung-tissue",
"tcga:brca.luma-subtype",
"tcga:brca.lumb-subtype")
sample_size_vs_a_plot_lessdatasets_df = sample_size_vs_a_df %>%
filter(type_correlation == type_correlation_selected) %>%
filter(dataset_name %in% datasets_selected) %>%
mutate(dataset = replace(dataset, dataset == "scipher", "R. arthritis"),
dataset = replace(dataset, dataset == "gtex", "GTEx"),
dataset = replace(dataset, dataset == "tcga", "TCGA")) %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "gtex:breast.mammary.tissue_female",
"GTEx: Breast")) %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "tcga:tcga-brca-tumor_female",
"TCGA: Breast cancer")) %>%
left_join( (name2color %>%
as.data.frame() %>%
dplyr::rename("rgb_col" = ".") %>%
tibble::rownames_to_column("dataset")# %>%
#filter(dataset_name %in% c("TCGA: Breast cancer", "GTEx: Breast"))
),
by = c("dataset") ) %>%
mutate(rgb_col = replace(rgb_col, is.na(rgb_col), "#000000")) %>% # Non-selected datasets with color black
mutate(geom_text_repel_label=dplyr::if_else(dataset_name == "R. arthritis", "R. arthritis",
dplyr::if_else(dataset_name == "GTEx: Whole blood", "GTEx: Whole blood",
dplyr::if_else(dataset_name == "GTEx: Lung", "GTEx: Lung",
dplyr::if_else(dataset_name == "GTEx: Breast", "GTEx: Breast",
dplyr::if_else(dataset_name == "TCGA: Lung cancer", "TCGA: Lung cancer",
dplyr::if_else(dataset_name == "TCGA: Breast cancer", "TCGA: Breast cancer", "")))))))
# Calculate correlation
a_vs_fr_cor <- cor(sample_size_vs_a_plot_lessdatasets_df$a, sample_size_vs_a_plot_lessdatasets_df$num_edges_from_statistical_corrected_norm)
# Calculate regression
lm_summary_avsfr <- summary(lm(sample_size_vs_a_plot_lessdatasets_df$num_edges_from_statistical_corrected_norm ~ sample_size_vs_a_plot_lessdatasets_df$a))
slope_avsfr <- coef(lm_summary_avsfr)[2]
intercept_avsfr <- coef(lm_summary_avsfr)[1]
adj.r.squared_avsfr <- lm_summary_avsfr$adj.r.squared
lm_cor_result_avsfr <- paste("R² =",
round(adj.r.squared_avsfr, 3),
"\ncorr. =",
round(a_vs_fr_cor, 3))
# Make scatter plot
a_vs_fraction_sig_correlations_lessdatasets_plot <- sample_size_vs_a_plot_lessdatasets_df %>%
ggplot(aes(x=a, y=num_edges_from_statistical_corrected_norm)) +
geom_point(aes(col=rgb_col), alpha = 0.6, size = 3, show.legend = TRUE) +
#scale_colour_identity() +
scale_colour_identity(labels = unique(sample_size_vs_a_plot_lessdatasets_df$dataset), breaks = unique(sample_size_vs_a_plot_lessdatasets_df$rgb_col), guide = "legend") +
geom_label_repel(aes(col = rgb_col, label = geom_text_repel_label),
fill="white",
box.padding = 0.5, max.overlaps = Inf,
label.size = 0.75,
size = 5,
alpha = 0.6,
label.padding = 0.25,
fontface = "bold",
na.rm = TRUE,
show.legend = FALSE,
max.iter = 1e5,
seed = 5555) +
geom_abline(aes(slope = slope_avsfr,
intercept = intercept_avsfr),
linetype = "dashed",
col = "gray50",
show.legend = FALSE) +
annotate("text", x = 2.00, y = 0.45, label = lm_cor_result_avsfr, size = 5) +
xlab(expression(alpha)) +
ylab("Frac. correlations above 0.2") +
theme_linedraw() +
theme(aspect.ratio = 1,
plot.title = element_text(size = 20, face = "bold"),
axis.title = element_text(size = 17, face = "bold"),
axis.text = element_text(size = 16),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
#legend.position="none")
legend.position = c(0.2, 0.9),
legend.background = element_rect(fill = 'transparent', color = "transparent"),
legend.text = element_text(size = 16),
legend.title=element_blank())
print(a_vs_fraction_sig_correlations_lessdatasets_plot)
plot_file <- paste(plots_dir,
"a_vs_fraction_sig_correlations_pearson_pval_0.05_lessdatasets.png",
sep="/")
ggsave(
plot_file,
plot=a_vs_fraction_sig_correlations_lessdatasets_plot,
dpi = 1200,
#width = 6400,
#height = 6000,
units = c("px")
)
## Saving 8400 x 6000 px image
Create plots showing the variation of each gene across samples:
datasets_selected = c("gtex:breast.mammary.tissue_female",
"gtex:lung",
"gtex:whole.blood",
"tcga:tcga-brca-tumor_female",
"tcga:tcga-luad-tumor",
"scipher:scipher.sample.per.patient.baseline")
parameters = c("sd", "mean", "cv")
parameter2label <- list("sd"="SD", "mean"="mean", "cv"="CV")
type_counts_selected = "reads"
distribution_plots = list()
sd_genes_filtered = sd_genes_df %>%
filter(dataset_name %in% datasets_selected) %>%
filter(type_counts == type_counts_selected) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "scipher.sample.per.patient.baseline", "R. arthritis")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "Whole.Blood", "GTEx: Whole blood")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "Lung", "GTEx: Lung")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "Breast.Mammary.Tissue", "GTEx: Breast")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "TCGA-LUAD", "TCGA: Lung cancer")) %>%
mutate(type_dataset = replace(type_dataset, type_dataset == "TCGA-BRCA", "TCGA: Breast cancer")) %>%
inner_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("type_dataset")), by=c("type_dataset") )
print(unique(sd_genes_filtered$dataset_name)) # test to check if datasets are selected correctly
## [1] "gtex:breast.mammary.tissue_female"
## [2] "gtex:lung"
## [3] "gtex:whole.blood"
## [4] "tcga:tcga-brca-tumor_female"
## [5] "tcga:tcga-luad-tumor"
## [6] "scipher:scipher.sample.per.patient.baseline"
for (x in 1:length(parameters)){
parameter = parameters[x]
# # Plot with legend
# distribution_plot = sd_genes_filtered %>%
# ggplot(aes(.data[[parameter]], color=type_dataset)) +
# geom_freqpoly(size=1.5) +
# scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#65F3BF", "#0072B2", "#56B4E9")) +
# scale_x_log10() +
# scale_y_log10() +
# labs(x=paste("Gene expression ", parameter2label[[parameter]], " (Log)", sep=""), y = "Number of genes (Log)") +
# theme_linedraw() +
# guides(col=guide_legend(title="Dataset")) +
# theme(aspect.ratio=1, plot.title = element_text(size = 20, face="bold"), axis.title = element_text(size = 17, face="bold"), axis.text = element_text(size = 16), legend.text = element_text(size = 16), legend.title=element_text(size=16, face="bold"))
# Plot without legend
distribution_plot = sd_genes_filtered %>%
ggplot(aes(.data[[parameter]], color=rgb_col)) +
geom_freqpoly(size=1.5) +
scale_colour_identity() +
scale_x_log10() +
scale_y_log10() +
labs(x=paste("Gene expression ", parameter2label[[parameter]], " (Log)", sep=""), y = "Number of genes (Log)") +
theme_linedraw() +
guides(col=guide_legend(title="Dataset")) +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none")
# Save distribution plot without changes in the theme, because we will tune it afterwards with patchwork
distribution_plots[[x]] = distribution_plot
plot_file = paste(plots_dir, "/distribution_", parameter, "_genes.png", sep="")
ggsave(
plot_file,
plot=distribution_plot,
dpi = 1200,
#width = 10000,
#height = 6000,
units = c("px")
)
print(distribution_plot)
}
## Saving 8400 x 6000 px image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Saving 8400 x 6000 px image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Saving 8400 x 6000 px image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
Less datasets:
datasets_selected = c("gtex:breast.mammary.tissue_female",
"tcga:tcga-brca-tumor_female")
parameters = c("sd", "mean", "cv")
parameter2label <- list("sd"="SD", "mean"="mean", "cv"="CV")
type_counts_selected = "reads"
distribution_plots_lessdatasets = list()
sd_genes_filtered = sd_genes_df %>%
filter(dataset_name %in% datasets_selected) %>%
filter(type_counts == type_counts_selected) %>%
mutate(type_dataset = replace(type_dataset,
type_dataset == "Breast.Mammary.Tissue",
"GTEx: Breast")) %>%
mutate(type_dataset = replace(type_dataset,
type_dataset == "TCGA-BRCA",
"TCGA: Breast cancer")) %>%
inner_join( (name2color %>%
as.data.frame() %>%
dplyr::rename("rgb_col"=".") %>%
tibble::rownames_to_column("type_dataset")),
by=c("type_dataset") )
print(unique(sd_genes_filtered$dataset_name)) # test to check if datasets are selected correctly
## [1] "gtex:breast.mammary.tissue_female" "tcga:tcga-brca-tumor_female"
for (x in 1:length(parameters)){
parameter = parameters[x]
# Plot without legend
distribution_plot_lessdatasets = sd_genes_filtered %>%
ggplot(aes(.data[[parameter]], color=rgb_col)) +
geom_freqpoly(size=1.5) +
scale_colour_identity() +
scale_x_log10() +
scale_y_log10() +
labs(x=paste("Gene expression ",
parameter2label[[parameter]],
" (Log)", sep=""),
y = "Number of genes (Log)") +
theme_linedraw() +
guides(col=guide_legend(title="Dataset")) +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none")
# Save distribution plot without changes in the theme, because we will tune it
# afterwards with patchwork
distribution_plots_lessdatasets[[x]] = distribution_plot_lessdatasets
plot_file = paste(plots_dir,
"/distribution_",
parameter,
"_genes_lessdatasets.png",
sep="")
ggsave(
plot_file,
plot=distribution_plot_lessdatasets,
dpi = 1200,
#width = 10000,
#height = 6000,
units = c("px")
)
print(distribution_plot_lessdatasets)
}
## Saving 8400 x 6000 px image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Saving 8400 x 6000 px image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Saving 8400 x 6000 px image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
Show numbers:
a_vs_fraction_corr_file =
paste(tables_dir,
"a_vs_fraction_sig_correlations_pearson_pval_0.05.txt",
sep='/')
a_vs_fraction_corr_df = fread(a_vs_fraction_corr_file)
head(a_vs_fraction_corr_df)
## type_dataset model a
## 1: gse193677:rectum_uc_inflamed Stretched exponential (by linear fit) 2.198803
## 2: gse193677:rectum_uc_inflamed Stretched exponential (by linear fit) 2.198803
## 3: gse193677:rectum_uc_inflamed Stretched exponential (by linear fit) 2.198803
## 4: gse193677:rectum_uc_inflamed Stretched exponential (by linear fit) 2.198803
## 5: tcga:lung Stretched exponential (by linear fit) 2.183346
## 6: tcga:lung Stretched exponential (by linear fit) 2.183346
## b L max_value_in_dataset
## 1: 214.4731 1.902216 87033411
## 2: 214.4731 1.902216 87033411
## 3: 214.4731 1.902216 87033411
## 4: 214.4731 1.902216 87033411
## 5: 177.4498 2.056570 89070336
## 6: 177.4498 2.056570 89070336
## formula adj.r.squared
## 1: F(x) = 1.9 * exp[(214.47 * x**((2.2) + 1)) / ((2.2) + 1) ] 0.9735496
## 2: F(x) = 1.9 * exp[(214.47 * x**((2.2) + 1)) / ((2.2) + 1) ] 0.9735496
## 3: F(x) = 1.9 * exp[(214.47 * x**((2.2) + 1)) / ((2.2) + 1) ] 0.9735496
## 4: F(x) = 1.9 * exp[(214.47 * x**((2.2) + 1)) / ((2.2) + 1) ] 0.9735496
## 5: F(x) = 2.06 * exp[(177.45 * x**((2.18) + 1)) / ((2.18) + 1) ] 0.9824779
## 6: F(x) = 2.06 * exp[(177.45 * x**((2.18) + 1)) / ((2.18) + 1) ] 0.9824779
## relative.error.mean unnorm_L total_num_edges density
## 1: 0.5254571 165556306 165556306 1
## 2: 0.5254571 165556306 165556306 1
## 3: 0.5254571 165556306 165556306 1
## 4: 0.5254571 165556306 165556306 1
## 5: 0.2578716 183179370 183179370 1
## 6: 0.2578716 183179370 183179370 1
## subclassification L_vs_total
## 1: disease_inflamed_vs_control 0
## 2: disease_inflamed_vs_control 0
## 3: disease_inflamed_vs_control 0
## 4: disease_inflamed_vs_control 0
## 5: tissue 0
## 6: tissue 0
## dataset_name dataset sex
## 1: gse193677:rectum-disease_inflamed_vs_control_uc gse193677 uc
## 2: gse193677:rectum-disease_inflamed_vs_control_uc gse193677 uc
## 3: gse193677:rectum-disease_inflamed_vs_control_uc gse193677 uc
## 4: gse193677:rectum-disease_inflamed_vs_control_uc gse193677 uc
## 5: tcga:lung-tissue tcga
## 6: tcga:lung-tissue tcga
## type_correlation correlation sample_size_statistical
## 1: weak 0.2 68.773025
## 2: moderate 0.4 18.002295
## 3: strong 0.6 8.523611
## 4: very strong 0.8 5.063346
## 5: weak 0.2 68.773025
## 6: moderate 0.4 18.002295
## sample_size_statistical_corrected num_edges_from_statistical_corrected
## 1: 940.92339 157679996
## 2: 222.20909 125753013
## 3: 88.31798 72105576
## 4: 39.95108 19259015
## 5: 945.75920 175094031
## 6: 223.34132 142792927
## num_edges_from_statistical_corrected_norm
## 1: 0.9524252
## 2: 0.7595785
## 3: 0.4355351
## 4: 0.1163291
## 5: 0.9558611
## 6: 0.7795252
sd_cut <- 10
mean_cut <- 10
cv_cut <- 50
sd_genes_df %>%
filter(type_counts == type_counts_selected) %>%
group_by(dataset_name, subclassification) %>%
summarize(mean_sd = mean(sd),
median_sd = median(sd),
mean_mean = mean(mean),
median_mean = median(mean),
mean_cv = mean(cv),
median_cv = median(cv)) %>%
head(10)
## `summarise()` has grouped output by 'dataset_name'. You can override using the
## `.groups` argument.
## # A tibble: 10 × 8
## # Groups: dataset_name [10]
## dataset_name subcl…¹ mean_sd media…² mean_…³ media…⁴ mean_cv media…⁵
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 gtex:breast.mammary.… "" 1852. 515. 3383. 1015. 78.3 53.2
## 2 gtex:breast.mammary.… "" 1711. 465. 3366. 1022. 68.6 48.7
## 3 gtex:lung "" 1815. 537. 3269. 974. 75.4 54.9
## 4 gtex:skin.sun.expose… "" 1638. 442. 3405. 908. 71.2 48.2
## 5 gtex:thyroid "" 1546. 476. 3175. 1033. 68.1 45.4
## 6 gtex:whole.blood "" 2400. 335. 3268. 403. 105. 78.8
## 7 scipher:scipher.comp… "" 429. 111. 972. 269. 49.1 42.6
## 8 scipher:scipher.comp… "" 448. 116. 1007. 280. 48.6 42.8
## 9 scipher:scipher.comp… "" 437. 111. 990. 277. 48.0 41.2
## 10 scipher:scipher.samp… "" 394. 104. 953. 261. 46.5 40.7
## # … with abbreviated variable names ¹subclassification, ²median_sd, ³mean_mean,
## # ⁴median_mean, ⁵median_cv
# Calculate number of genes above certain SD, mean and CV cut-offs
sd_genes_vs_a_df = sd_genes_df %>%
filter(type_counts == type_counts_selected) %>%
group_by(dataset,
type_dataset,
dataset_name,
type_counts,
sex,
subclassification) %>%
summarize(num_genes_above_sd_cut = length(sd[sd >= sd_cut]),
num_genes_above_mean_cut = length(mean[mean >= mean_cut]),
num_genes_above_cv_cut = length(cv[cv >= cv_cut]),
num_genes = n()) %>%
ungroup() %>%
mutate(fraction_genes_above_sd_cut = num_genes_above_sd_cut / num_genes,
fraction_genes_above_mean_cut = num_genes_above_mean_cut / num_genes,
fraction_genes_above_cv_cut = num_genes_above_cv_cut / num_genes) %>%
inner_join((a_vs_fraction_corr_df %>% select(-type_dataset)),
by = c("dataset_name", "dataset", "sex", "subclassification")) %>%
select(!c("type_correlation",
"correlation",
"sample_size_statistical",
"sample_size_statistical_corrected",
"num_edges_from_statistical_corrected",
"num_edges_from_statistical_corrected_norm")) %>%
unique() %>%
as.data.frame()
## `summarise()` has grouped output by 'dataset', 'type_dataset', 'dataset_name',
## 'type_counts', 'sex'. You can override using the `.groups` argument.
head(sd_genes_vs_a_df)
## dataset type_dataset dataset_name
## 1 gtex Breast.Mammary.Tissue gtex:breast.mammary.tissue
## 2 gtex Breast.Mammary.Tissue gtex:breast.mammary.tissue_female
## 3 gtex Lung gtex:lung
## 4 gtex Skin.Sun.Exposed.Lower.leg gtex:skin.sun.exposed.lower.leg
## 5 gtex Thyroid gtex:thyroid
## 6 gtex Whole.Blood gtex:whole.blood
## type_counts sex subclassification num_genes_above_sd_cut
## 1 reads 17266
## 2 reads female 17455
## 3 reads 17469
## 4 reads 17083
## 5 reads 17197
## 6 reads 14927
## num_genes_above_mean_cut num_genes_above_cv_cut num_genes
## 1 17407 9530 17777
## 2 17684 8796 18132
## 3 17623 10459 18030
## 4 17279 8427 17804
## 5 17411 7840 17870
## 6 14868 14607 15148
## fraction_genes_above_sd_cut fraction_genes_above_mean_cut
## 1 0.9712550 0.9791866
## 2 0.9626627 0.9752923
## 3 0.9688852 0.9774265
## 4 0.9595035 0.9705122
## 5 0.9623391 0.9743145
## 6 0.9854106 0.9815157
## fraction_genes_above_cv_cut model a
## 1 0.5360860 Stretched exponential (by linear fit) 1.720270
## 2 0.4851092 Stretched exponential (by linear fit) 1.811947
## 3 0.5800887 Stretched exponential (by linear fit) 1.654318
## 4 0.4733206 Stretched exponential (by linear fit) 1.836919
## 5 0.4387241 Stretched exponential (by linear fit) 1.746114
## 6 0.9642857 Stretched exponential (by linear fit) 1.650687
## b L max_value_in_dataset
## 1 43.15181 2.114016 74740207
## 2 64.89496 3.885985 42299605
## 3 28.07262 1.964261 82744314
## 4 69.54141 1.443081 109822205
## 5 46.00874 1.669286 95645379
## 6 21.33857 1.586486 72312862
## formula adj.r.squared
## 1 F(x) = 2.11 * exp[(43.15 * x**((1.72) + 1)) / ((1.72) + 1) ] 0.9871514
## 2 F(x) = 3.89 * exp[(64.89 * x**((1.81) + 1)) / ((1.81) + 1) ] 0.9963943
## 3 F(x) = 1.96 * exp[(28.07 * x**((1.65) + 1)) / ((1.65) + 1) ] 0.9838666
## 4 F(x) = 1.44 * exp[(69.54 * x**((1.84) + 1)) / ((1.84) + 1) ] 0.9921006
## 5 F(x) = 1.67 * exp[(46.01 * x**((1.75) + 1)) / ((1.75) + 1) ] 0.9898823
## 6 F(x) = 1.59 * exp[(21.34 * x**((1.65) + 1)) / ((1.65) + 1) ] 0.9832295
## relative.error.mean unnorm_L total_num_edges density L_vs_total
## 1 0.1889067 158001976 158001976 1 0
## 2 0.3086317 164375646 164375646 1 0
## 3 0.2831612 162531435 162531435 1 0
## 4 0.1308873 158482306 158482306 1 0
## 5 0.1486842 159659515 159659515 1 0
## 6 0.1622811 114723378 114723378 1 0
Plot different parameters (sd, mean, cv) vs. alpha:
parameters = c("sd", "mean", "cv")
parameter2label <- list("sd"="SD", "mean"="mean", "cv"="CV")
datasets_selected <- c("scipher:scipher.sample.per.patient.baseline",
"gtex:breast.mammary.tissue_female",
"gtex:lung",
"gtex:skin.sun.exposed.lower.leg",
"gtex:thyroid",
"gtex:whole.blood",
"tcga:tcga-brca-tumor_female",
"tcga:tcga-kirc-tumor",
"tcga:tcga-kirp-tumor",
"tcga:tcga-luad-tumor",
"tcga:tcga-lusc-tumor",
"tcga:tcga-thca-tumor",
"tcga:breast-tissue_female",
"tcga:kidney-tissue",
"tcga:lung-tissue",
"tcga:brca.luma-subtype",
"tcga:brca.lumb-subtype")
# Prepare table for plots
sd_genes_vs_a_filtered = sd_genes_vs_a_df %>%
filter(type_counts == type_counts_selected) %>%
filter(dataset_name %in% datasets_selected) %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "scipher:scipher.sample.per.patient.baseline",
"R. arthritis")) %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "gtex:whole.blood",
"GTEx: Whole blood")) %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "gtex:lung",
"GTEx: Lung")) %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "gtex:breast.mammary.tissue_female",
"GTEx: Breast")) %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "tcga:tcga-luad-tumor",
"TCGA: Lung cancer")) %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "tcga:tcga-brca-tumor_female",
"TCGA: Breast cancer")) %>%
left_join( (name2color %>%
as.data.frame() %>%
dplyr::rename("rgb_col" = ".") %>%
tibble::rownames_to_column("dataset_name")),
by = c("dataset_name") ) %>%
mutate(rgb_col =
replace(rgb_col,
is.na(rgb_col),
"#000000")) %>% # Non-selected datasets with color black
mutate(geom_text_repel_label =
dplyr::if_else(dataset_name == "R. arthritis",
"R. arthritis",
dplyr::if_else(dataset_name == "GTEx: Whole blood",
"GTEx: Whole blood",
dplyr::if_else(dataset_name == "GTEx: Lung",
"GTEx: Lung", dplyr::if_else(dataset_name == "GTEx: Breast",
"GTEx: Breast",
dplyr::if_else(dataset_name == "TCGA: Lung cancer",
"TCGA: Lung cancer",
dplyr::if_else(dataset_name == "TCGA: Breast cancer", "TCGA: Breast cancer", ""))))))) %>%
as.data.frame()
scatter_plots = list()
print(unique(sd_genes_vs_a_filtered$dataset_name)) # test to check if datasets are selected correctly
## [1] "GTEx: Breast" "GTEx: Lung"
## [3] "gtex:skin.sun.exposed.lower.leg" "gtex:thyroid"
## [5] "GTEx: Whole blood" "R. arthritis"
## [7] "tcga:brca.luma-subtype" "tcga:brca.lumb-subtype"
## [9] "tcga:breast-tissue_female" "tcga:kidney-tissue"
## [11] "tcga:lung-tissue" "TCGA: Breast cancer"
## [13] "tcga:tcga-kirc-tumor" "tcga:tcga-kirp-tumor"
## [15] "TCGA: Lung cancer" "tcga:tcga-lusc-tumor"
## [17] "tcga:tcga-thca-tumor"
# Make plot for each type of parameter (SD, mean, CV)
for (x in 1:length(parameters)){
parameter = parameters[x]
if (parameter == "cv"){
cut <- cv_cut
x_legend <- 1.55
y_legend <- 0.45
} else if (parameter == "sd"){
cut <- sd_cut
x_legend <- 1.585
y_legend <- 0.938
} else if (parameter == "mean"){
cut <- mean_cut
x_legend <- 1.585
y_legend <- 0.968
}
# Calculate correlation
ss_vs_a_cor <- cor(sd_genes_vs_a_filtered$a,
sd_genes_vs_a_filtered[[paste("fraction_genes_above_",
parameter,
"_cut",
sep = "")]])
# Calculate regression
lm_summary <- summary(lm(sd_genes_vs_a_filtered[[paste("fraction_genes_above_",
parameter,
"_cut",
sep = "")]] ~
sd_genes_vs_a_filtered$a))
slope <- coef(lm_summary)[2]
intercept <- coef(lm_summary)[1]
adj.r.squared <- lm_summary$adj.r.squared
lm_cor_result <- paste("R² =",
round(adj.r.squared, 3),
"\ncorr. =",
round(ss_vs_a_cor, 3))
param_vs_a_plot = sd_genes_vs_a_filtered %>%
ggplot(aes(x = a,
y = .data[[paste("fraction_genes_above_",
parameter,
"_cut",
sep = "")]])) +
geom_point(aes(col=rgb_col), show.legend = FALSE) +
scale_colour_identity() +
geom_label_repel(aes(col = rgb_col,
label = geom_text_repel_label),
fill = "white",
box.padding = 0.5,
max.overlaps = Inf,
label.size = 0.75,
size = 5,
alpha = 0.6,
label.padding=0.25,
fontface = "bold",
na.rm = TRUE,
seed = 1234) +
new_scale_color() +
geom_abline(aes(slope = slope,
intercept = intercept,
col = lm_cor_result),
linetype = "dashed",
show.legend = FALSE) +
scale_color_manual(values = c("gray50")) +
annotate("text", x = x_legend, y = y_legend, label = lm_cor_result, size = 5) +
xlab(expression(alpha)) +
ylab(paste("Frac. genes with ",
parameter2label[[parameter]],
" above ",
cut,
sep = "")) +
theme_linedraw() +
theme(aspect.ratio = 1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none")
#legend.position = c(0.70, 0.25),
#legend.text = element_text(size = 14),
#legend.title=element_blank())
plot_file = paste(plots_dir, "/a_vs_fraction_genes_above_", parameter, "_cut.png", sep="")
ggsave(
plot_file,
dpi = 1200,
width = 6000,
#height = 6000,
units = c("px")
)
print(param_vs_a_plot)
scatter_plots[[x]] = param_vs_a_plot
}
## Saving 6000 x 6000 px image
## Saving 6000 x 6000 px image
## Saving 6000 x 6000 px image
With less datasets:
parameters = c("sd", "mean", "cv")
parameter2label <- list("sd"="SD", "mean"="mean", "cv"="CV")
datasets_selected <- c("scipher:scipher.sample.per.patient.baseline",
"gtex:breast.mammary.tissue_female",
"gtex:lung",
"gtex:skin.sun.exposed.lower.leg",
"gtex:thyroid",
"gtex:whole.blood",
"tcga:tcga-brca-tumor_female",
"tcga:tcga-kirc-tumor",
"tcga:tcga-kirp-tumor",
"tcga:tcga-luad-tumor",
"tcga:tcga-lusc-tumor",
"tcga:tcga-thca-tumor",
"tcga:breast-tissue_female",
"tcga:kidney-tissue",
"tcga:lung-tissue",
"tcga:brca.luma-subtype",
"tcga:brca.lumb-subtype")
# Prepare table for plots
sd_genes_vs_a_filtered = sd_genes_vs_a_df %>%
filter(dataset_name %in% datasets_selected) %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "gtex:breast.mammary.tissue_female",
"GTEx: Breast")) %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "tcga:tcga-brca-tumor_female",
"TCGA: Breast cancer")) %>%
left_join( (name2color %>%
as.data.frame() %>%
dplyr::rename("rgb_col"=".") %>%
tibble::rownames_to_column("dataset") #%>%
#filter(dataset_name %in% c("TCGA: Breast cancer", "GTEx: Breast"))
),
by=c("dataset") ) %>%
mutate(rgb_col = replace(rgb_col,
is.na(rgb_col),
"#000000")) %>% # Non-selected datasets with color black
mutate(geom_text_repel_label =
dplyr::if_else(dataset_name == "R. arthritis",
"R. arthritis",
dplyr::if_else(dataset_name == "GTEx: Whole blood",
"GTEx: Whole blood",
dplyr::if_else(dataset_name == "GTEx: Lung",
"GTEx: Lung",
dplyr::if_else(dataset_name == "GTEx: Breast",
"GTEx: Breast", dplyr::if_else(dataset_name == "TCGA: Lung cancer",
"TCGA: Lung cancer",
dplyr::if_else(dataset_name == "TCGA: Breast cancer",
"TCGA: Breast cancer",
""))))))) %>%
as.data.frame()
scatter_plots_lessdatasets = list()
lm_summary_avspar = list()
lm_cor_result_avspar = list()
for (x in 1:length(parameters)){
parameter = parameters[x]
if (parameter == "cv"){
cut <- cv_cut
x_legend <- 2
y_legend <- 0.38
} else if (parameter == "sd"){
cut <- sd_cut
x_legend <- 1.585
y_legend <- 0.938
} else if (parameter == "mean"){
cut <- mean_cut
x_legend <- 1.585
y_legend <- 0.968
}
# Calculate correlation
a_vs_par_cor <- cor(sd_genes_vs_a_filtered$a,
sd_genes_vs_a_filtered[[paste("fraction_genes_above_",
parameter,
"_cut",
sep = "")]])
# Calculate regression
lm_summary_avspar[[x]] <- summary(lm(sd_genes_vs_a_filtered[[paste("fraction_genes_above_",
parameter,
"_cut",
sep = "")]] ~
sd_genes_vs_a_filtered$a))
lm_cor_result_avspar[[x]] <- paste("R² =",
round(lm_summary_avspar[[x]]$adj.r.squared, 3),
"\ncorr. =",
round(a_vs_par_cor, 3))
scatter_plots_lessdatasets[[x]] <- sd_genes_vs_a_filtered %>%
ggplot(aes(x = a,
y = .data[[paste("fraction_genes_above_",
parameter,
"_cut",
sep = "")]],
col = rgb_col)) +
geom_point(alpha = 0.6, size = 3) +
scale_colour_identity() +
geom_label_repel(aes(col=rgb_col, label=geom_text_repel_label),
fill="white",
max.iter = 1e6,
label.size = 0.75,
size = 5,
alpha = 0.6,
box.padding = 0.5,
label.padding = 0.25,
fontface = "bold",
na.rm=TRUE,
seed = 1234) +
geom_abline(aes(slope = coef(lm_summary_avspar[[x]])[2],
intercept = coef(lm_summary_avspar[[x]])[1]),
col = "gray50",
linetype = "dashed",
show.legend = FALSE) +
annotate("text", x = x_legend, y = y_legend, label = lm_cor_result_avspar[[x]], size = 5) +
xlab(expression(alpha)) +
ylab(paste("Frac. genes with ",
parameter2label[[parameter]],
" above ",
cut,
sep = "")) +
theme_linedraw() +
theme(aspect.ratio = 1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none")
plot_file = paste(plots_dir,
"/a_vs_fraction_genes_above_",
parameter,
"_cut_lessdatasets.png",
sep = "")
ggsave(
plot_file,
plot = scatter_plots_lessdatasets[[x]],
dpi = 1200,
width = 6000,
#height = 6000,
units = c("px")
)
print(scatter_plots_lessdatasets[[x]])
}
## Saving 6000 x 6000 px image
## Saving 6000 x 6000 px image
## Saving 6000 x 6000 px image
Make figure for Supplementary Information:
layout <- "
ABC
DFG
"
((distribution_plots_lessdatasets[[2]] + labs(tag = 'A')) +
(distribution_plots_lessdatasets[[1]] + labs(tag = 'B')) +
(distribution_plots_lessdatasets[[3]] + labs(tag = 'C')) +
((scatter_plots_lessdatasets[[2]] +
geom_abline(aes(slope = coef(lm_summary_avspar[[2]])[2],
intercept = coef(lm_summary_avspar[[2]])[1]),
col = "gray50",
linetype = "dashed",
show.legend = FALSE)) + labs(tag = 'D')) +
((scatter_plots_lessdatasets[[1]] +
geom_abline(aes(slope = coef(lm_summary_avspar[[1]])[2],
intercept = coef(lm_summary_avspar[[1]])[1]),
col = "gray50",
linetype = "dashed",
show.legend = FALSE)) + labs(tag = 'E')) +
(scatter_plots_lessdatasets[[3]] + labs(tag = 'F'))) +
plot_layout(design = layout) &
theme(plot.tag = element_text(face = 'bold', size = 17),
plot.tag.position = c(.1, 1.02),
plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider increasing max.overlaps
## ggrepel: 2 unlabeled data points (too many overlaps). Consider increasing max.overlaps
plot_file = paste(plots_dir, "/cv_results_SI.png", sep="")
ggsave(
plot_file,
dpi = 1200,
width = 17500,
#height = 12500,
height = 12000,
units = c("px")
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
We calculate the sample size necessary to reveal 50% of the significant correlations:
calculate_optimal_N <- function(L, a, b, s, N_guess=c(10000)) {
optimize_N = function(par){
N = par[1]
f = s - L * exp((b * N ** (-a + 1)) / (-a + 1))
return((abs(f)))
}
res = optim(par=N_guess, fn=optimize_N, method="Brent", lower=3, upper=50000)
return(res$par)
}
model_selected = "Stretched exponential (by linear fit)"
cols = c("dataset_name", "s", "a", "N_opt")
fraction_links <- 0.5
N_opt_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))
for (dataset_selected in unique(analytical_model_summary_df$dataset_name)){
analytical_params_selected = analytical_model_summary_df %>%
filter((dataset_name == dataset_selected) & (model == model_selected))
N_opt <- calculate_optimal_N(L = analytical_params_selected$L,
a = analytical_params_selected$a,
b = analytical_params_selected$b,
s = fraction_links * analytical_params_selected$L)
# Why multiply 0.5 x L? Because the normalized result (e.g., 0.5) is the result of:
# 0.5 = (s * max_value_in_dataset / max_value_in_dataset) / L = s / L
# --> s = 0.5 * L
N_opt_df = rbind(N_opt_df,
data.frame(dataset_name = dataset_selected,
s = fraction_links,
a = analytical_params_selected$a,
N_opt = N_opt))
}
N_opt_file = paste(tables_dir,
"/optimal_sample_size_",
fraction_links,
"links",
sep = "")
N_opt_df %>%
fwrite(N_opt_file)
N_opt_df
## dataset_name s a N_opt
## 1 scipher:scipher.complete.dataset 0.5 1.790429 249.47471
## 2 scipher:scipher.sample.per.patient.baseline 0.5 1.758283 321.23471
## 3 gtex:breast.mammary.tissue 0.5 1.720270 488.45289
## 4 gtex:breast.mammary.tissue_female 0.5 1.811947 346.25815
## 5 gtex:lung 0.5 1.654318 547.29617
## 6 gtex:skin.sun.exposed.lower.leg 0.5 1.836919 304.64639
## 7 gtex:thyroid 0.5 1.746114 409.71393
## 8 gtex:whole.blood 0.5 1.650687 375.10439
## 9 tcga:tcga-brca-tumor_female 0.5 1.478753 3914.54702
## 10 tcga:tcga-kirc-tumor 0.5 1.565623 1373.65244
## 11 tcga:tcga-kirp-tumor 0.5 1.668704 741.92255
## 12 tcga:tcga-luad-tumor 0.5 1.649890 899.08309
## 13 tcga:tcga-lusc-tumor 0.5 1.549679 2160.52992
## 14 tcga:tcga-thca-tumor 0.5 1.610579 929.03972
## 15 tcga:breast-tissue_female 0.5 1.836527 314.92674
## 16 tcga:kidney-tissue 0.5 1.771018 384.51866
## 17 tcga:lung-tissue 0.5 2.183346 94.04511
## 18 tcga:brca.luma-subtype 0.5 1.528906 2190.78289
## 19 tcga:brca.lumb-subtype 0.5 1.441041 5950.41238
## 20 gse193677:rectum-disease_inflamed_vs_control_cd 0.5 1.898637 221.26127
## 21 gse193677:rectum-disease_inflamed_vs_control_control 0.5 1.942484 200.27661
## 22 gse193677:rectum-disease_inflamed_vs_control_uc 0.5 2.198803 102.76362
Then, we match the optimal sample size to obtain 50% of significant correlations with the CV of the dataset.
sd_cut <- 10
mean_cut <- 10
cv_cut <- 50
datasets_selected <- c("scipher:scipher.sample.per.patient.baseline",
"gtex:breast.mammary.tissue_female",
"gtex:lung",
"gtex:skin.sun.exposed.lower.leg",
"gtex:thyroid",
"gtex:whole.blood",
"tcga:tcga-brca-tumor_female",
"tcga:tcga-kirc-tumor",
"tcga:tcga-kirp-tumor",
"tcga:tcga-luad-tumor",
"tcga:tcga-lusc-tumor",
"tcga:tcga-thca-tumor",
"tcga:breast-tissue_female",
"tcga:kidney-tissue",
"tcga:lung-tissue",
"tcga:brca.luma-subtype",
"tcga:brca.lumb-subtype")
sd_genes_vs_ss_df <- sd_genes_df %>%
filter(dataset_name %in% datasets_selected) %>%
group_by(dataset, type_dataset, dataset_name) %>%
summarize(num_genes_above_sd_cut=length(sd[sd >= sd_cut]),
num_genes_above_mean_cut=length(mean[mean >= mean_cut]),
num_genes_above_cv_cut=length(cv[cv >= cv_cut]),
num_genes_total=n()
) %>%
ungroup() %>%
mutate(fraction_genes_above_sd_cut = num_genes_above_sd_cut / num_genes_total,
fraction_genes_above_mean_cut = num_genes_above_mean_cut / num_genes_total,
fraction_genes_above_cv_cut = num_genes_above_cv_cut / num_genes_total) %>%
inner_join(N_opt_df, by=c("dataset_name")) %>%
unique() %>%
as.data.frame()
## `summarise()` has grouped output by 'dataset', 'type_dataset'. You can override
## using the `.groups` argument.
sd_genes_vs_ss_file = paste(tables_dir,
"/sd_vs_optimal_sample_size_",
fraction_links,
"links",
sep = "")
sd_genes_vs_ss_df %>%
fwrite(sd_genes_vs_ss_file)
# Modify dataset names for the plots
sd_genes_vs_ss_df = sd_genes_vs_ss_df %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "tcga:tcga-brca-tumor_female",
"TCGA: Breast cancer")) %>%
mutate(dataset_name =
replace(dataset_name,
dataset_name == "gtex:breast.mammary.tissue_female",
"GTEx: Breast")) %>%
# mutate(dataset_name =
# replace(dataset_name,
# dataset_name == "scipher:scipher.sample.per.patient.baseline",
# "R. arthritis")) %>%
# mutate(dataset_name =
# replace(dataset_name,
# dataset_name == "gtex:whole.blood",
# "GTEx: Whole blood")) %>%
# #mutate(dataset_name = replace(dataset_name, dataset_name == "gtex:thyroid", "GTEx: Thyroid")) %>%
# mutate(dataset_name =
# replace(dataset_name,
# dataset_name == "gtex:lung",
# "GTEx: Lung")) %>%
# #mutate(dataset_name = replace(dataset_name, dataset_name == "gtex:artery.tibial", "GTEx: Artery tibial")) %>%
# #mutate(dataset_name = replace(dataset_name, dataset_name == "tcga:tcga-coad", "TCGA: Colon cancer")) %>%
# #mutate(dataset_name = replace(dataset_name, dataset_name == "tcga:tcga-thca", "TCGA: Thyroid cancer")) %>%
# mutate(dataset_name =
# replace(dataset_name,
# dataset_name == "tcga:tcga-luad-tumor",
# "TCGA: Lung cancer")) %>%
left_join( (name2color %>%
as.data.frame() %>%
dplyr::rename("rgb_col"=".") %>%
tibble::rownames_to_column("dataset") #%>%
#filter(dataset_name %in% c("TCGA: Breast cancer", "GTEx: Breast"))
),
by=c("dataset") ) %>%
mutate(rgb_col = replace(rgb_col, is.na(rgb_col), "#000000")) %>% # Non-selected datasets with color black
mutate(geom_text_repel_label=dplyr::if_else(dataset_name == "R. arthritis", "R. arthritis",
dplyr::if_else(dataset_name == "GTEx: Whole blood", "GTEx: Whole blood",
dplyr::if_else(dataset_name == "GTEx: Lung", "GTEx: Lung",
dplyr::if_else(dataset_name == "GTEx: Breast", "GTEx: Breast",
dplyr::if_else(dataset_name == "TCGA: Lung cancer", "TCGA: Lung cancer",
dplyr::if_else(dataset_name == "TCGA: Breast cancer", "TCGA: Breast cancer", "")))))))
sd_genes_vs_ss_df
## dataset type_dataset
## 1 gtex Breast.Mammary.Tissue
## 2 gtex Lung
## 3 gtex Skin.Sun.Exposed.Lower.leg
## 4 gtex Thyroid
## 5 gtex Whole.Blood
## 6 scipher scipher.sample.per.patient.baseline
## 7 tcga BRCA.LumA
## 8 tcga BRCA.LumB
## 9 tcga Breast
## 10 tcga Kidney
## 11 tcga Lung
## 12 tcga TCGA-BRCA
## 13 tcga TCGA-KIRC
## 14 tcga TCGA-KIRP
## 15 tcga TCGA-LUAD
## 16 tcga TCGA-LUSC
## 17 tcga TCGA-THCA
## dataset_name num_genes_above_sd_cut
## 1 GTEx: Breast 17455
## 2 gtex:lung 17469
## 3 gtex:skin.sun.exposed.lower.leg 17083
## 4 gtex:thyroid 17197
## 5 gtex:whole.blood 14927
## 6 scipher:scipher.sample.per.patient.baseline 12480
## 7 tcga:brca.luma-subtype 18712
## 8 tcga:brca.lumb-subtype 18503
## 9 tcga:breast-tissue_female 18564
## 10 tcga:kidney-tissue 18302
## 11 tcga:lung-tissue 18160
## 12 TCGA: Breast cancer 18804
## 13 tcga:tcga-kirc-tumor 18776
## 14 tcga:tcga-kirp-tumor 18098
## 15 tcga:tcga-luad-tumor 18965
## 16 tcga:tcga-lusc-tumor 19175
## 17 tcga:tcga-thca-tumor 18081
## num_genes_above_mean_cut num_genes_above_cv_cut num_genes_total
## 1 17684 8796 18132
## 2 17623 10459 18030
## 3 17279 8427 17804
## 4 17411 7840 17870
## 5 14868 14607 15148
## 6 13189 4163 13757
## 7 18777 13892 19181
## 8 18629 15160 19017
## 9 19107 10259 19826
## 10 18811 9001 19424
## 11 18484 13582 19141
## 12 18804 16231 19118
## 13 18888 13129 19310
## 14 18214 14830 18549
## 15 18950 18700 19200
## 16 19239 17100 19537
## 17 18330 10249 18827
## fraction_genes_above_sd_cut fraction_genes_above_mean_cut
## 1 0.9626627 0.9752923
## 2 0.9688852 0.9774265
## 3 0.9595035 0.9705122
## 4 0.9623391 0.9743145
## 5 0.9854106 0.9815157
## 6 0.9071745 0.9587119
## 7 0.9755487 0.9789375
## 8 0.9729716 0.9795972
## 9 0.9363462 0.9637345
## 10 0.9422364 0.9684411
## 11 0.9487488 0.9656758
## 12 0.9835757 0.9835757
## 13 0.9723459 0.9781460
## 14 0.9756860 0.9819397
## 15 0.9877604 0.9869792
## 16 0.9814711 0.9847469
## 17 0.9603761 0.9736017
## fraction_genes_above_cv_cut s a N_opt rgb_col
## 1 0.4851092 0.5 1.811947 346.25815 #00BA37
## 2 0.5800887 0.5 1.654318 547.29617 #00BA37
## 3 0.4733206 0.5 1.836919 304.64639 #00BA37
## 4 0.4387241 0.5 1.746114 409.71393 #00BA37
## 5 0.9642857 0.5 1.650687 375.10439 #00BA37
## 6 0.3026096 0.5 1.758283 321.23471 #D55E00
## 7 0.7242584 0.5 1.528906 2190.78289 #0072B2
## 8 0.7971815 0.5 1.441041 5950.41238 #0072B2
## 9 0.5174518 0.5 1.836527 314.92674 #0072B2
## 10 0.4633958 0.5 1.771018 384.51866 #0072B2
## 11 0.7095763 0.5 2.183346 94.04511 #0072B2
## 12 0.8489905 0.5 1.478753 3914.54702 #0072B2
## 13 0.6799068 0.5 1.565623 1373.65244 #0072B2
## 14 0.7995040 0.5 1.668704 741.92255 #0072B2
## 15 0.9739583 0.5 1.649890 899.08309 #0072B2
## 16 0.8752623 0.5 1.549679 2160.52992 #0072B2
## 17 0.5443778 0.5 1.610579 929.03972 #0072B2
## geom_text_repel_label
## 1 GTEx: Breast
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12 TCGA: Breast cancer
## 13
## 14
## 15
## 16
## 17
We plot the results of log10 alpha vs log10 sample size:
# Calculate correlation
ss_vs_a_cor <- cor(log10(sd_genes_vs_ss_df$N_opt),
log10(sd_genes_vs_ss_df$a))
# Calculate regression
lm_summary_ssvsa <- summary(lm(log10(sd_genes_vs_ss_df$a) ~
log10(sd_genes_vs_ss_df$N_opt)))
slope_ssvsa <- coef(lm_summary_ssvsa)[2]
intercept_ssvsa <- coef(lm_summary_ssvsa)[1]
adj.r.squared_ssvsa <- lm_summary_ssvsa$adj.r.squared
lm_cor_result_ssvsa <- paste("R² =",
round(adj.r.squared_ssvsa, 3),
"\ncorr. =",
round(ss_vs_a_cor, 3))
a_vs_ss_plot <- sd_genes_vs_ss_df %>%
ggplot(aes(x = log10(N_opt),
y = log10(a),
col = rgb_col)) +
geom_point(alpha = 0.6, size = 3) +
scale_colour_identity() +
geom_label_repel(aes(col = rgb_col,
label = geom_text_repel_label),
fill = "white",
box.padding = 0.5,
max.iter = 1e5,
label.size = 0.75,
size = 5,
alpha = 0.6,
label.padding = 0.25,
fontface = "bold",
na.rm = TRUE,
seed = 1234) +
new_scale_color() +
geom_abline(aes(slope = slope_ssvsa,
intercept = intercept_ssvsa,
col = lm_cor_result_ssvsa),
linetype = "dashed",
show.legend = FALSE) +
scale_color_manual(values = c("gray50")) +
#annotate("text", x = 4500, y = 2.05, label = lm_cor_result_ssvsa, size = 5) +
annotate("text", x = 3.3, y = 0.32, label = lm_cor_result_ssvsa, size = 5) +
xlab("log10(n) to get 50% sig. links") +
ylab(expression(bold(log10(alpha)))) +
theme_linedraw() +
theme(aspect.ratio = 1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none")
plot_file = paste(plots_dir, "/ss_vs_a.png", sep="")
ggsave(
plot_file,
plot = a_vs_ss_plot,
dpi = 1200,
width = 8000,
#height = 6000,
units = c("px")
)
## Saving 8000 x 6000 px image
print(a_vs_ss_plot)
We plot the results of alpha vs sample size (lin-lin):
# Calculate correlation
ss_vs_a_cor <- cor(sd_genes_vs_ss_df$N_opt,
sd_genes_vs_ss_df$a)
# Calculate regression
lm_summary_ssvsa_linlin <- summary(lm(sd_genes_vs_ss_df$a ~
sd_genes_vs_ss_df$N_opt))
slope_ssvsa_linlin <- coef(lm_summary_ssvsa_linlin)[2]
intercept_ssvsa_linlin <- coef(lm_summary_ssvsa_linlin)[1]
adj.r.squared_ssvsa <- lm_summary_ssvsa_linlin$adj.r.squared
lm_cor_result_ssvsa <- paste("R² =",
round(adj.r.squared_ssvsa, 3),
"\ncorr. =",
round(ss_vs_a_cor, 3))
a_vs_ss_lin_lin_plot <- sd_genes_vs_ss_df %>%
ggplot(aes(x = N_opt,
y = a,
col = rgb_col)) +
geom_point(alpha = 0.6, size = 3) +
scale_colour_identity() +
geom_label_repel(aes(col = rgb_col,
label = geom_text_repel_label),
fill = "white",
box.padding = 0.5,
max.iter = 1e5,
label.size = 0.75,
size = 5,
alpha = 0.6,
label.padding=0.25,
fontface = "bold",
na.rm = TRUE,
seed = 1234) +
new_scale_color() +
geom_abline(aes(slope = slope_ssvsa_linlin,
intercept = intercept_ssvsa_linlin,
col = lm_cor_result_ssvsa),
linetype = "dashed",
show.legend = FALSE) +
scale_color_manual(values = c("gray50")) +
#annotate("text", x = 4500, y = 2.05, label = lm_cor_result_ssvsa, size = 5) +
annotate("text", x = 4000, y = 2, label = lm_cor_result_ssvsa, size = 5) +
xlab("n to get 50% sig. links") +
ylab(expression(bold(alpha))) +
theme_linedraw() +
theme(aspect.ratio = 1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none")
print(a_vs_ss_lin_lin_plot)
We plot the results of log10 alpha vs sample size:
# Calculate correlation
ss_vs_a_cor <- cor(sd_genes_vs_ss_df$N_opt,
log10(sd_genes_vs_ss_df$a))
# Calculate regression
lm_summary_ssvsa_loglin <- summary(lm(log10(sd_genes_vs_ss_df$a) ~
sd_genes_vs_ss_df$N_opt))
slope_ssvsa_loglin <- coef(lm_summary_ssvsa_loglin)[2]
intercept_ssvsa_loglin <- coef(lm_summary_ssvsa_loglin)[1]
adj.r.squared_ssvsa <- lm_summary_ssvsa_loglin$adj.r.squared
lm_cor_result_ssvsa <- paste("R² =",
round(adj.r.squared_ssvsa, 3),
"\ncorr. =",
round(ss_vs_a_cor, 3))
a_vs_ss_log_lin_plot <- sd_genes_vs_ss_df %>%
ggplot(aes(x = N_opt,
y = log10(a),
col = rgb_col)) +
geom_point(alpha = 0.6, size = 3) +
scale_colour_identity() +
geom_label_repel(aes(col = rgb_col,
label = geom_text_repel_label),
fill = "white",
box.padding = 0.5,
max.overlaps = Inf,
label.size = 0.75,
size = 5,
alpha = 0.6,
label.padding=0.25,
fontface = "bold",
na.rm = TRUE,
seed = 1234) +
new_scale_color() +
geom_abline(aes(slope = slope_ssvsa_loglin,
intercept = intercept_ssvsa_loglin,
col = lm_cor_result_ssvsa),
linetype = "dashed",
show.legend = FALSE) +
scale_color_manual(values = c("gray50")) +
#annotate("text", x = 4500, y = 2.05, label = lm_cor_result_ssvsa, size = 5) +
annotate("text", x = 4000, y = 0.3, label = lm_cor_result_ssvsa, size = 5) +
xlab("n to get 50% sig. links") +
ylab(expression(bold(log10(alpha)))) +
theme_linedraw() +
theme(aspect.ratio = 1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none")
print(a_vs_ss_log_lin_plot)
We plot the results of alpha vs log10 sample size:
# Calculate correlation
ss_vs_a_cor <- cor(log10(sd_genes_vs_ss_df$N_opt),
sd_genes_vs_ss_df$a)
# Calculate regression
lm_summary_ssvsa_linlog <- summary(lm(sd_genes_vs_ss_df$a ~
log10(sd_genes_vs_ss_df$N_opt)))
slope_ssvsa_linlog <- coef(lm_summary_ssvsa_linlog)[2]
intercept_ssvsa_linlog <- coef(lm_summary_ssvsa_linlog)[1]
adj.r.squared_ssvsa <- lm_summary_ssvsa_linlog$adj.r.squared
lm_cor_result_ssvsa <- paste("R² =",
round(adj.r.squared_ssvsa, 3),
"\ncorr. =",
round(ss_vs_a_cor, 3))
a_vs_ss_lin_log_plot <- sd_genes_vs_ss_df %>%
ggplot(aes(x = log10(N_opt),
y = a,
col = rgb_col)) +
geom_point(alpha = 0.6, size = 3) +
scale_colour_identity() +
geom_label_repel(aes(col = rgb_col,
label = geom_text_repel_label),
fill = "white",
box.padding = 0.5,
max.iter = 1e5,
label.size = 0.75,
size = 5,
alpha = 0.6,
label.padding=0.25,
fontface = "bold",
na.rm = TRUE,
seed = 1234) +
new_scale_color() +
geom_abline(aes(slope = slope_ssvsa_linlog,
intercept = intercept_ssvsa_linlog,
col = lm_cor_result_ssvsa),
linetype = "dashed",
show.legend = FALSE) +
scale_color_manual(values = c("gray50")) +
#annotate("text", x = 4500, y = 2.05, label = lm_cor_result_ssvsa, size = 5) +
annotate("text", x = 3.3, y = 2, label = lm_cor_result_ssvsa, size = 5) +
xlab("log10(n) to get 50% sig. links") +
ylab(expression(bold(alpha))) +
theme_linedraw() +
theme(aspect.ratio = 1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none")
print(a_vs_ss_lin_log_plot)
We plot all results of alpha vs. sample size together:
layout <- "
AB
CD
"
((a_vs_ss_lin_lin_plot +
labs(tag = 'A') +
theme(plot.tag.position = c(.07, 1.04),
plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
) +
(a_vs_ss_lin_log_plot +
labs(tag = 'B') +
theme(plot.tag.position = c(.07, 1.04),
plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
) +
(a_vs_ss_log_lin_plot +
labs(tag = 'C') +
theme(plot.tag.position = c(.07, 1.04),
plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
) +
(a_vs_ss_plot +
labs(tag = 'D') +
theme(plot.tag.position = c(.07, 1.04),
plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
)) +
plot_layout(design = layout) &
theme(plot.tag = element_text(face = 'bold', size = 17))
plot_file = paste(plots_dir, "/ss_vs_a_all_combinations.png", sep="")
ggsave(
plot_file,
dpi = 1200,
width = 12000,
height = 11000,
units = c("px")
)
We plot the results of CV vs sample size:
# Calculate correlation
cv_vs_ss_cor <- cor(sd_genes_vs_ss_df$N_opt,
sd_genes_vs_ss_df$fraction_genes_above_cv_cut)
# Calculate regression
lm_summary_cvvsss <- summary(lm(sd_genes_vs_ss_df$fraction_genes_above_cv_cut ~
sd_genes_vs_ss_df$N_opt))
slope_cvvsss <- coef(lm_summary_cvvsss)[2]
intercept_cvvsss <- coef(lm_summary_cvvsss)[1]
adj.r.squared_cvvsss <- lm_summary_cvvsss$adj.r.squared
lm_cor_result_cvvsss <- paste("R² =",
round(adj.r.squared_cvvsss, 3),
"\ncorr. =",
round(cv_vs_ss_cor, 3))
cv_vs_ss_plot <- sd_genes_vs_ss_df %>%
ggplot(aes(x = N_opt,
y = fraction_genes_above_cv_cut,
col = rgb_col)) +
geom_point(alpha = 0.6, size = 3) +
scale_colour_identity() +
geom_label_repel(aes(col = rgb_col,
label = geom_text_repel_label),
fill = "white",
box.padding = 0.5,
max.overlaps = Inf,
label.size = 0.75,
size = 5,
alpha = 0.6,
label.padding=0.25,
fontface = "bold",
na.rm = TRUE,
seed = 1234) +
new_scale_color() +
geom_abline(aes(slope = slope_cvvsss,
intercept = intercept_cvvsss,
col = lm_cor_result_cvvsss),
linetype = "dashed",
show.legend = FALSE) +
scale_color_manual(values = c("gray50")) +
annotate("text", x = 4500, y = 0.41, label = lm_cor_result_cvvsss, size = 5) +
xlab("Num. samples for 50% sig. links") +
ylab(paste("Frac. genes with CV above ",
cv_cut,
sep = "")) +
theme_linedraw() +
theme(aspect.ratio = 1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none")
plot_file = paste(plots_dir, "/ss_vs_fraction_genes_above_cv_cut.png", sep="")
ggsave(
plot_file,
plot = cv_vs_ss_plot,
dpi = 1200,
width = 8000,
#height = 6000,
units = c("px")
)
## Saving 8000 x 6000 px image
print(cv_vs_ss_plot)
We plot the results of SD vs sample size:
# Calculate correlation
sd_vs_ss_cor <- cor(sd_genes_vs_ss_df$N_opt,
sd_genes_vs_ss_df$fraction_genes_above_sd_cut)
# Calculate regression
lm_summary_sdvsss <- summary(lm(sd_genes_vs_ss_df$fraction_genes_above_sd_cut ~
sd_genes_vs_ss_df$N_opt))
slope_sdvsss <- coef(lm_summary_sdvsss)[2]
intercept_sdvsss <- coef(lm_summary_sdvsss)[1]
adj.r.squared_sdvsss <- lm_summary_sdvsss$adj.r.squared
lm_cor_result_sdvsss <- paste("R² =",
round(adj.r.squared_sdvsss, 3),
"\ncorr. =",
round(sd_vs_ss_cor, 3))
sd_vs_ss_plot <- sd_genes_vs_ss_df %>%
ggplot(aes(x = N_opt,
y = fraction_genes_above_sd_cut,
col = rgb_col)) +
geom_point(alpha = 0.6, size = 3) +
scale_colour_identity() +
geom_label_repel(aes(col = rgb_col,
label = geom_text_repel_label),
fill = "white",
box.padding = 0.5,
#max.overlaps = Inf,
max.iter = 1e5,
label.size = 0.75,
size = 5,
alpha = 0.6,
label.padding=0.25,
fontface = "bold",
na.rm = TRUE,
seed = 1234) +
new_scale_color() +
geom_abline(aes(slope = slope_sdvsss,
intercept = intercept_sdvsss,
col = lm_cor_result_sdvsss),
linetype = "dashed",
show.legend = FALSE) +
scale_color_manual(values = c("gray50")) +
annotate("text", x = 3500, y = 0.93, label = lm_cor_result_sdvsss, size = 5) +
xlab("Num. samples for 50% sig. links") +
ylab(paste("Frac. genes with SD above ",
sd_cut,
sep = "")) +
theme_linedraw() +
theme(aspect.ratio = 1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none")
plot_file = paste(plots_dir, "/ss_vs_fraction_genes_above_sd_cut.png", sep="")
ggsave(
plot_file,
plot = sd_vs_ss_plot,
dpi = 1200,
width = 8000,
#height = 6000,
units = c("px")
)
## Saving 8000 x 6000 px image
print(sd_vs_ss_plot)
Create a panel with all the results:
# # Use patchwork to concatenate plots
# (((curve_plot + labs(tag = 'A')) |
# (scaling_relation_plot + labs(tag = 'B')) |
# (comparison_plot + labs(tag = 'C'))) /
# (wrap_elements(figure_summary_table_plot) + labs(tag='D')) /
# ((prediction_plot + labs(tag = 'E')) |
# (a_vs_fraction_sig_correlations_plot + labs(tag = 'F')) |
# (scatter_plots[[3]] + labs(tag='G')))) &
# theme(element_text(face = 'bold', size = 17)) # bold tags
layout <- "
ABC
DDD
EFG
"
((goodnessfit_plot_lessdatasets +
labs(tag = 'A') +
theme(plot.tag.position = c(.09, 1.04),
plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
) +
(scaling_relation_plot_lessdatasets +
labs(tag = 'B') +
theme(plot.tag.position = c(.07, 1.04),
plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
) +
(prediction_plot_lessdatasets +
labs(tag = 'C') +
theme(plot.tag.position = c(.07, 1.04),
plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
) +
(wrap_elements(figure_summary_table_plot) +
labs(tag='D') +
theme(plot.tag.position = c(.20, .87),
plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
) +
(a_vs_fraction_sig_correlations_lessdatasets_plot +
labs(tag = 'E') +
theme(plot.tag.position = c(.09, 1.04),
plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
) +
(scatter_plots_lessdatasets[[3]] +
labs(tag = 'F') +
theme(plot.tag.position = c(.04, 1.04),
plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
) +
(a_vs_ss_lin_log_plot +
labs(tag='G')
) +
theme(plot.tag.position = c(.04, 1.04),
plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
) +
plot_layout(design = layout) &
theme(plot.tag = element_text(face = 'bold', size = 17))
## Warning: Removed 9938 rows containing missing values (`geom_point()`).
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_file = paste(plots_dir, "/modelling_and_cv_results.png", sep="")
ggsave(
plot_file,
dpi = 1200,
width = 20500,
#height = 12500,
height = 18000,
units = c("px")
)
## Warning: Removed 9938 rows containing missing values (`geom_point()`).
# (((curve_plot +
# labs(tag = 'A') +
# theme(legend.position="none")) |
# (comparison_plot +
# labs(tag = 'D') +
# guides(col=guide_legend(title="Model", title.position="top")) +
# theme(legend.position = c(0.7, 0.24), legend.text = element_text(size = 11), legend.title=element_text(size=13, face="bold"), legend.background = element_rect(size=0.2, linetype="solid", colour ="black"))) |
# (a_vs_fraction_sig_correlations_plot +
# labs(tag = 'F') +
# theme(legend.position="none"))) /
# ((scaling_relation_plot +
# labs(tag = 'B') +
# theme(legend.position="none")) |
# (prediction_plot +
# labs(tag = 'E') +
# #guides(col=guide_legend(title="Dataset", title.position="top", nrow=3,byrow=TRUE)) +
# #theme(legend.position = 'bottom', legend.text = element_text(size = 11), legend.title=element_text(size=13, face="bold"))) |
# theme(legend.position = 'none')) |
# (distribution_plots[[3]] +
# labs(tag = 'G') +
# theme(legend.position="none"))) /
# ((wrap_elements(figure_summary_table_plot) + labs(tag='C')) |
# (scatter_plots[[3]] + labs(tag='H')))) &
# #plot_annotation(tag_levels = 'A') & # Include letters
# theme(element_text(face = 'bold', size = 17)) # that are bold
Define function to calculate critical sample size for correlation:
calculate_sample_size_for_pearson_correlation_using_t_distribution_with_optimization = function(r, total_num_edges, alpha=0.05, N_guess=c(10000)){
optimize_N = function(par){
N = par[1]
t_guess = qt(alpha, df=N-2, lower.tail = F)
t = r * sqrt((N-2)) / sqrt((1-r**2))
return((abs(t-t_guess)))
}
res = optim(par=N_guess, fn=optimize_N, method="Brent", lower=3, upper=50000)
return(res$par)
}
calculate_sample_size_for_pearson_correlation_using_z_distribution_with_power = function(r, total_num_edges, alpha=0.05, power=0.8){
alpha_corrected = alpha / total_num_edges # Correct alpha by multiple error
Za = qnorm(alpha, lower.tail=F)
Za_corrected = qnorm(alpha_corrected, lower.tail=F)
Zb = qnorm(power, lower.tail=F)
N = ((Za + Zb) / (0.5 * log((1+r)/(1-r))))**2 + 3
N_corrected = ((Za_corrected + Zb) / (0.5 * log((1+r)/(1-r))))**2 + 3
return(list(N=N, N_corrected=N_corrected))
}
calculate_sample_size_for_pearson_correlation_using_z_distribution = function(r, alpha=0.05){
Za = qnorm(alpha, lower.tail=F)
N = r / (2*Za - log((1+r)/(1-r))) + 1
return(N)
}
calculate_sample_size_for_pearson_correlation_using_bonnet_and_wright = function(r, w, alpha=0.05){
Za = qnorm(alpha, lower.tail=F)
N0 = round(4 * (1-r**2)**2 * (Za/w)**2)
L1 = 0.5 * (log(1+r) - log(1-r)) - Za/sqrt(N0-3)
L2 = 0.5 * (log(1+r) - log(1-r)) + Za/sqrt(N0-3)
w0 = ((exp(2*L2) - 1) / (exp(2*L2) + 1)) - ((exp(2*L1) - 1) / (exp(2*L1) + 1))
N = ((N0 - 3) * (w0 / w)**2) + 3
return(N)
}
calculate_predictions_using_stretched_exponential_model_optimized = function(x, L, a, b){
y = L * exp((b*x**(-a+1))/(-a+1))
#y = exp(log(L) - exp(b+a*log(x)))
return(y)
}
Calculate sample size for different levels of correlation and different datasets:
type_correlation_df = data.frame(type_correlation=c("weak", "moderate", "strong", "very strong"), lower_val=c(0.2,0.4,0.6,0.8), upper_val=c(0.4,0.6,0.8,NA))
types_correlation = c("weak", "moderate", "strong", "very strong")
cols = c("dataset", "type_correlation", "correlation", "sample_size_statistical", "sample_size_statistical_corrected", "sample_size_z")
sample_size_correlation_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))
for (dataset in numbers_df$dataset){
total_num_edges = (numbers_df %>% filter(dataset == !!dataset))$total_num_edges
for (type_correlation in types_correlation){
r = (type_correlation_df %>% filter(type_correlation == !!type_correlation))$lower_val
N = calculate_sample_size_for_pearson_correlation_using_t_distribution_with_optimization(r=r, total_num_edges=total_num_edges, alpha=0.05, N_guess=c(10000))
N_corrected = calculate_sample_size_for_pearson_correlation_using_t_distribution_with_optimization(r=r, total_num_edges=total_num_edges, alpha=0.05/total_num_edges, N_guess=c(10000))
N_z = calculate_sample_size_for_pearson_correlation_using_bonnet_and_wright(r=r, w=0.8, alpha=0.05/total_num_edges)
sample_size_correlation_df <- rbind(sample_size_correlation_df, data.frame(dataset=dataset, type_correlation=type_correlation, correlation=r, sample_size_statistical=N, sample_size_statistical_corrected=N_corrected, sample_size_z=N_z))
}
}
print(sample_size_correlation_df)
## dataset type_correlation correlation
## 1 scipher:scipher.complete.dataset weak 0.2
## 2 scipher:scipher.complete.dataset moderate 0.4
## 3 scipher:scipher.complete.dataset strong 0.6
## 4 scipher:scipher.complete.dataset very strong 0.8
## 5 scipher:scipher.sample.per.patient.baseline weak 0.2
## 6 scipher:scipher.sample.per.patient.baseline moderate 0.4
## 7 scipher:scipher.sample.per.patient.baseline strong 0.6
## 8 scipher:scipher.sample.per.patient.baseline very strong 0.8
## 9 tcga:tcga-brca_female weak 0.2
## 10 tcga:tcga-brca_female moderate 0.4
## 11 tcga:tcga-brca_female strong 0.6
## 12 tcga:tcga-brca_female very strong 0.8
## 13 tcga:tcga-kirc weak 0.2
## 14 tcga:tcga-kirc moderate 0.4
## 15 tcga:tcga-kirc strong 0.6
## 16 tcga:tcga-kirc very strong 0.8
## 17 tcga:tcga-kirp weak 0.2
## 18 tcga:tcga-kirp moderate 0.4
## 19 tcga:tcga-kirp strong 0.6
## 20 tcga:tcga-kirp very strong 0.8
## 21 tcga:tcga-luad weak 0.2
## 22 tcga:tcga-luad moderate 0.4
## 23 tcga:tcga-luad strong 0.6
## 24 tcga:tcga-luad very strong 0.8
## 25 tcga:tcga-lusc weak 0.2
## 26 tcga:tcga-lusc moderate 0.4
## 27 tcga:tcga-lusc strong 0.6
## 28 tcga:tcga-lusc very strong 0.8
## 29 tcga:tcga-thca weak 0.2
## 30 tcga:tcga-thca moderate 0.4
## 31 tcga:tcga-thca strong 0.6
## 32 tcga:tcga-thca very strong 0.8
## 33 tcga:breast_female weak 0.2
## 34 tcga:breast_female moderate 0.4
## 35 tcga:breast_female strong 0.6
## 36 tcga:breast_female very strong 0.8
## 37 tcga:kidney weak 0.2
## 38 tcga:kidney moderate 0.4
## 39 tcga:kidney strong 0.6
## 40 tcga:kidney very strong 0.8
## 41 tcga:lung weak 0.2
## 42 tcga:lung moderate 0.4
## 43 tcga:lung strong 0.6
## 44 tcga:lung very strong 0.8
## 45 tcga:brca.luma weak 0.2
## 46 tcga:brca.luma moderate 0.4
## 47 tcga:brca.luma strong 0.6
## 48 tcga:brca.luma very strong 0.8
## 49 tcga:brca.lumb weak 0.2
## 50 tcga:brca.lumb moderate 0.4
## 51 tcga:brca.lumb strong 0.6
## 52 tcga:brca.lumb very strong 0.8
## 53 gtex:breast.mammary.tissue weak 0.2
## 54 gtex:breast.mammary.tissue moderate 0.4
## 55 gtex:breast.mammary.tissue strong 0.6
## 56 gtex:breast.mammary.tissue very strong 0.8
## 57 gtex:breast.mammary.tissue_female weak 0.2
## 58 gtex:breast.mammary.tissue_female moderate 0.4
## 59 gtex:breast.mammary.tissue_female strong 0.6
## 60 gtex:breast.mammary.tissue_female very strong 0.8
## 61 gtex:lung weak 0.2
## 62 gtex:lung moderate 0.4
## 63 gtex:lung strong 0.6
## 64 gtex:lung very strong 0.8
## 65 gtex:skin.sun.exposed.lower.leg weak 0.2
## 66 gtex:skin.sun.exposed.lower.leg moderate 0.4
## 67 gtex:skin.sun.exposed.lower.leg strong 0.6
## 68 gtex:skin.sun.exposed.lower.leg very strong 0.8
## 69 gtex:thyroid weak 0.2
## 70 gtex:thyroid moderate 0.4
## 71 gtex:thyroid strong 0.6
## 72 gtex:thyroid very strong 0.8
## 73 gtex:whole.blood weak 0.2
## 74 gtex:whole.blood moderate 0.4
## 75 gtex:whole.blood strong 0.6
## 76 gtex:whole.blood very strong 0.8
## 77 gse193677:rectum_cd_inflamed weak 0.2
## 78 gse193677:rectum_cd_inflamed moderate 0.4
## 79 gse193677:rectum_cd_inflamed strong 0.6
## 80 gse193677:rectum_cd_inflamed very strong 0.8
## 81 gse193677:rectum_control_noninflamed weak 0.2
## 82 gse193677:rectum_control_noninflamed moderate 0.4
## 83 gse193677:rectum_control_noninflamed strong 0.6
## 84 gse193677:rectum_control_noninflamed very strong 0.8
## 85 gse193677:rectum_uc_inflamed weak 0.2
## 86 gse193677:rectum_uc_inflamed moderate 0.4
## 87 gse193677:rectum_uc_inflamed strong 0.6
## 88 gse193677:rectum_uc_inflamed very strong 0.8
## sample_size_statistical sample_size_statistical_corrected sample_size_z
## 1 68.773025 914.63989 196.99184
## 2 18.002295 216.05522 154.37229
## 3 8.523611 85.91378 97.82438
## 4 5.063346 38.90078 49.91944
## 5 68.773025 914.18952 196.90537
## 6 18.002295 215.94977 154.30369
## 7 8.523611 85.87258 97.77885
## 8 5.063346 38.88278 49.89271
## 9 68.773025 945.64424 203.65958
## 10 18.002295 223.31440 159.55817
## 11 8.523611 88.74980 101.07195
## 12 5.063346 40.13973 51.49231
## 13 68.773025 946.59975 203.84314
## 14 18.002295 223.53812 159.70378
## 15 8.523611 88.83721 101.16863
## 16 5.063346 40.17791 51.54901
## 17 68.773025 942.75523 203.01564
## 18 18.002295 222.63799 159.04099
## 19 8.523611 88.48554 100.75184
## 20 5.063346 40.02428 51.32086
## 21 68.773025 946.05349 203.73821
## 22 18.002295 223.41022 159.62054
## 23 8.523611 88.78724 101.11336
## 24 5.063346 40.15608 51.51660
## 25 68.773025 947.71732 204.05778
## 26 18.002295 223.79978 159.87405
## 27 8.523611 88.93943 101.28168
## 28 5.063346 40.22257 51.61533
## 29 68.773025 944.17760 203.28879
## 30 18.002295 222.97101 159.33462
## 31 8.523611 88.61565 100.89565
## 32 5.063346 40.08112 51.40528
## 33 68.773025 949.12151 204.41649
## 34 18.002295 224.12855 160.08794
## 35 8.523611 89.06788 101.42372
## 36 5.063346 40.27868 51.69864
## 37 68.773025 947.16262 203.95126
## 38 18.002295 223.66991 159.78954
## 39 8.523611 88.88869 101.22557
## 40 5.063346 40.20040 51.58241
## 41 68.773025 945.75920 203.68167
## 42 18.002295 223.34132 159.57569
## 43 8.523611 88.76032 101.08359
## 44 5.063346 40.14432 51.49913
## 45 68.773025 945.95881 203.72002
## 46 18.002295 223.38805 159.60611
## 47 8.523611 88.77858 101.10378
## 48 5.063346 40.15230 51.51098
## 49 68.773025 945.13773 203.56227
## 50 18.002295 223.19581 159.48097
## 51 8.523611 88.70347 101.02071
## 52 5.063346 40.11949 51.46225
## 53 68.773025 938.69085 202.14591
## 54 18.002295 221.68638 158.42167
## 55 8.523611 88.11376 100.34082
## 56 5.063346 39.86187 51.07961
## 57 68.773025 940.58126 202.59800
## 58 18.002295 222.12898 158.70978
## 59 8.523611 88.28668 100.53201
## 60 5.063346 39.93741 51.19182
## 61 68.773025 940.04191 202.40535
## 62 18.002295 222.00271 158.62759
## 63 8.523611 88.23735 100.47747
## 64 5.063346 39.91586 51.15981
## 65 68.773025 938.83594 202.17378
## 66 18.002295 221.72035 158.44378
## 67 8.523611 88.12704 100.35549
## 68 5.063346 39.86767 51.08822
## 69 68.773025 939.18970 202.24171
## 70 18.002295 221.80317 158.49771
## 71 8.523611 88.15940 100.39127
## 72 5.063346 39.88180 51.10922
## 73 68.773025 923.39357 198.94043
## 74 18.002295 218.10476 155.86004
## 75 8.523611 86.71449 98.73771
## 76 5.063346 39.25058 50.43868
## 77 68.773025 940.92339 202.66374
## 78 18.002295 222.20909 158.76192
## 79 8.523611 88.31798 100.56661
## 80 5.063346 39.95108 51.21213
## 81 68.773025 940.92339 202.66374
## 82 18.002295 222.20909 158.76192
## 83 8.523611 88.31798 100.56661
## 84 5.063346 39.95108 51.21213
## 85 68.773025 940.92339 202.66374
## 86 18.002295 222.20909 158.76192
## 87 8.523611 88.31798 100.56661
## 88 5.063346 39.95108 51.21213
dataset_selected = "gtex:whole.blood"
model_selected = "Stretched exponential (by linear fit)"
results_analytical_gtex_wb = topology_results_selected_analytical_norm_df %>%
filter((model == model_selected) & (type_dataset == dataset_selected)) %>%
select(model, type_dataset, a, b, L, max_value_in_dataset) %>% unique()
sample_size_correlation_gtex_wb = sample_size_correlation_df %>%
filter((dataset == dataset_selected) & (!(type_correlation == "weak")))
predictions_convergence_gtex_wb_norm = calculate_predictions_using_stretched_exponential_model_optimized(x=sample_size_correlation_gtex_wb$sample_size_statistical_corrected, L=results_analytical_gtex_wb$L, a=results_analytical_gtex_wb$a, b=results_analytical_gtex_wb$b)
predictions_convergence_gtex_wb_df = data.frame(size=round(sample_size_correlation_gtex_wb$sample_size_statistical_corrected), num_edges=predictions_convergence_gtex_wb_norm * results_analytical_gtex_wb$max_value_in_dataset, num_edges_norm=predictions_convergence_gtex_wb_norm)
predictions_convergence_gtex_wb_df$num_edges_norm = (predictions_convergence_gtex_wb_df$num_edges/results_analytical_gtex_wb$max_value_in_dataset)/results_analytical_gtex_wb$L
predictions_convergence_gtex_wb_df
## size num_edges num_edges_norm
## 1 218 42782199 0.37291614
## 2 87 19008527 0.16569009
## 3 39 5649757 0.04924678
predicted_results_wb_df = predicted_results_df %>% filter((model == model_selected) & (type_dataset == dataset_selected)) %>% mutate_at(c('model_result'), as.numeric)
predicted_results_wb_df$model_result_norm = (predicted_results_wb_df$model_result/results_analytical_gtex_wb$max_value_in_dataset)/results_analytical_gtex_wb$L
#mtcars_mod = mtcars %>% mutate_at(c("carb"), as.character)
#mtcars_mod %>% ggplot(aes(mpg, wt)) + geom_point(aes(col=carb, size=gear))
dataset_selected = "Whole.Blood"
all_topology_results_file = '/home/j.aguirreplans/Projects/Scipher/SampleSize/scripts/SampleSizeShiny/data/analysis_topology.csv'
threshold_selected = 0.05
all_results_df = fread(all_topology_results_file)
results_gtex_wb = all_results_df %>%
filter((type_dataset == dataset_selected) & (threshold == threshold_selected) & (type_correlation %in% c("weak-moderate-strong-very strong", "moderate-strong-very strong", "strong-very strong", "very strong")))
results_gtex_wb$num_edges_norm = (results_gtex_wb$num_edges/results_analytical_gtex_wb$max_value_in_dataset)/results_analytical_gtex_wb$L
correlation_levels_plot_df = results_gtex_wb %>%
mutate(type_correlation = replace(type_correlation, type_correlation == "weak-moderate-strong-very strong", "\u2265 0.2")) %>%
mutate(type_correlation = replace(type_correlation, type_correlation == "moderate-strong-very strong", "\u2265 0.4")) %>%
mutate(type_correlation = replace(type_correlation, type_correlation == "strong-very strong", "\u2265 0.6")) %>%
mutate(type_correlation = replace(type_correlation, type_correlation == "very strong", "\u2265 0.8")) %>%
full_join((predicted_results_wb_df %>% filter(size <= max((all_results_df %>% filter((type_dataset == dataset_selected) & (threshold == threshold_selected)))$size)) %>% mutate(type_correlation="Model prediction")), by=c("type_dataset", "size", "type_correlation", "num_edges_norm"="model_result_norm")) %>%
full_join((predictions_convergence_gtex_wb_df %>% mutate(type_correlation="Convergence point")), by=c("size", "num_edges", "num_edges_norm", "type_correlation")) %>%
left_join( (name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("type_correlation")), by=c("type_correlation") )
correlation_levels_plot = correlation_levels_plot_df %>%
ggplot(aes(x=size, y=num_edges_norm)) +
#geom_point(data=.%>% filter(!(type_correlation %in% c("Convergence point", "Model prediction"))), aes(fill=factor(type_correlation), size=num_edges_norm), alpha=0.5, shape=21) +
geom_point(data=.%>% filter(!(type_correlation %in% c("Convergence point", "Model prediction"))), aes(col=type_correlation, size=num_edges_norm), alpha=0.5) +
scale_color_manual(name = "Correlation strength", values = as.vector(name2color[levels(factor((correlation_levels_plot_df %>% filter(!(type_correlation == "Model prediction")))$type_correlation))])) + # Plot using the dictionary
new_scale_color() +
#geom_point(data=.%>% filter(type_correlation == "Convergence point"), aes(fill=factor(type_correlation)), alpha=1, size=4, shape=21) +
geom_point(data=.%>% filter(type_correlation == "Convergence point"), aes(col=factor(type_correlation)), alpha=1, size=3) +
scale_color_manual(name = NULL, values = c("red")) + # Plot using the dictionary
new_scale_color() +
geom_line(data=.%>% filter(type_correlation == "Model prediction"), aes(col=type_correlation), linewidth=1) +
scale_color_manual(name = NULL, values = c("red")) + # Plot using the dictionary
#scale_x_continuous(trans = scales::log10_trans()) +
#scale_color_manual(values = c("#D55E00", "#E69F00", "#44AA99", "#0072B2", "#56B4E9")) +
theme_linedraw() +
xlab("Num. samples") +
#xlab("log(N)") +
ylab("Frac. significant correlations") +
guides(col=guide_legend(override.aes = list(alpha = 1, size = 3)), col=guide_legend(title=NULL), size=guide_legend(title="Frac. sig. correlations")) +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
legend.text = element_text(size = 16),
legend.title=element_text(size=16, face="bold"),
text = element_text(family = "Helvetica"))
plot_file = paste(plots_dir, "correlation_levels_pearson_pval_0.05_data_gtexwholeblood.png", sep="/")
ggsave(
plot_file,
plot = correlation_levels_plot,
dpi = 1200,
width = 9000,
height = 6000,
units = c("px")
)
print(correlation_levels_plot)
Read data:
input_data_file = "/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/tables/analysis_specific_edges_pearson_gtex_Whole.Blood.txt"
input_df = fread(input_data_file)
input_df$link = paste(input_df$Node.1, input_df$Node.2, sep="---")
input_df = input_df %>% mutate(pval.adj = replace(pval.adj, pval.adj < 0.000001, 0.000001))
head(input_df)
## Node.1 Node.2 score pval pval.adj method size rep
## 1: TRBV13 CACUL1 0.00799006 8.282168e-01 1 pearson 740 1
## 2: MARCKSL1 WSB1 0.18809614 2.547269e-07 1 pearson 740 1
## 3: CENPK STOML1 0.11626154 1.534606e-03 1 pearson 740 1
## 4: CDK5R1 TTLL1 0.04031572 2.733863e-01 1 pearson 740 1
## 5: FAM86B3P SORD2P 0.08008447 2.937999e-02 1 pearson 740 1
## 6: YBX1 STARD3 0.11214127 2.250180e-03 1 pearson 740 1
## link
## 1: TRBV13---CACUL1
## 2: MARCKSL1---WSB1
## 3: CENPK---STOML1
## 4: CDK5R1---TTLL1
## 5: FAM86B3P---SORD2P
## 6: YBX1---STARD3
Calculate mean and SD of correlation among replicates:
input_mean_df = input_df %>%
group_by(link, method, size) %>%
summarize(mean_score = mean(abs(score)), sd_score = sd(abs(score)), min_score=min(abs(score)), max_score=max(abs(score)), mean_pval = mean(pval.adj), sd_pval = sd(pval.adj), min_pval=min(pval.adj), max_pval=max(pval.adj)) %>%
ungroup()
## `summarise()` has grouped output by 'link', 'method'. You can override using
## the `.groups` argument.
head(input_mean_df)
## # A tibble: 6 × 11
## link method size mean_…¹ sd_sc…² min_s…³ max_s…⁴ mean_…⁵ sd_pval min_p…⁶
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ABHD1---… pears… 20 0.355 0.239 0.0684 0.692 1 0 1
## 2 ABHD1---… pears… 40 0.429 0.165 0.171 0.590 1 0 1
## 3 ABHD1---… pears… 60 0.315 0.0631 0.212 0.362 1 0 1
## 4 ABHD1---… pears… 80 0.430 0.0472 0.369 0.485 1 0 1
## 5 ABHD1---… pears… 100 0.313 0.122 0.172 0.495 1 0 1
## 6 ABHD1---… pears… 120 0.377 0.0677 0.294 0.481 1 0 1
## # … with 1 more variable: max_pval <dbl>, and abbreviated variable names
## # ¹mean_score, ²sd_score, ³min_score, ⁴max_score, ⁵mean_pval, ⁶min_pval
Select the links of different correlation levels:
set.seed(2106)
max_size_links_df = input_mean_df %>%
filter(size == max(size)) %>%
mutate(type_correlation="Very Weak") %>%
mutate(type_correlation = replace(type_correlation, abs(mean_score) >= 0.2, "Weak")) %>%
mutate(type_correlation = replace(type_correlation, abs(mean_score) >= 0.4, "Moderate")) %>%
mutate(type_correlation = replace(type_correlation, abs(mean_score) >= 0.6, "Strong")) %>%
mutate(type_correlation = replace(type_correlation, abs(mean_score) >= 0.8, "Very Strong"))
num_links_selected = 1
specific_links_df = max_size_links_df %>% select(link, type_correlation, mean_score) %>% filter(type_correlation == "Very Weak") %>% sample_n(size=num_links_selected, replace = FALSE)
specific_links_df = rbind(specific_links_df, max_size_links_df %>% select(link, type_correlation, mean_score) %>% filter(type_correlation == "Weak") %>% filter((abs(mean_score) > 0.25) & (abs(mean_score) < 0.35)) %>% sample_n(size=num_links_selected, replace = FALSE))
specific_links_df = rbind(specific_links_df, max_size_links_df %>% select(link, type_correlation, mean_score) %>% filter(type_correlation == "Moderate") %>% filter((abs(mean_score) > 0.45) & (abs(mean_score) < 0.55)) %>% sample_n(size=num_links_selected, replace = FALSE))
specific_links_df = rbind(specific_links_df, max_size_links_df %>% select(link, type_correlation, mean_score) %>% filter(type_correlation == "Strong") %>% filter((abs(mean_score) > 0.65) & (abs(mean_score) < 0.75)) %>% sample_n(size=num_links_selected, replace = FALSE))
specific_links_df = rbind(specific_links_df, max_size_links_df %>% select(link, type_correlation, mean_score) %>% filter(type_correlation == "Very Strong") %>% sample_n(size=num_links_selected, replace = FALSE))
head(specific_links_df)
## # A tibble: 5 × 3
## link type_correlation mean_score
## <chr> <chr> <dbl>
## 1 ZFPL1---FTH1P16 Very Weak 0.0433
## 2 GNPAT---THNSL2 Weak 0.305
## 3 HDHD3---VASP Moderate 0.478
## 4 TTC9C---CDK3 Strong 0.700
## 5 TSC22D2---UBR5 Very Strong 0.865
Make the plot:
specific_links_plot_df = input_mean_df %>%
inner_join((specific_links_df %>% select(link, type_correlation)), by=c("link")) %>%
mutate(max_pval_discrete=if_else(max_pval >= 0.05, "P. adj. \u2265 0.05", "P. adj. < 0.05")) %>%
mutate(max_pval_size=if_else(max_pval >= 0.05, "P. adj. \u2265 0.05", if_else(max_pval >= 0.01, "P. adj. < 0.05", if_else(max_pval >= 0.001, "P. adj. < 0.01", "P. adj. < 0.001")))) %>%
mutate(type_correlation = factor(type_correlation, levels = c("Very Strong", "Strong", "Moderate", "Weak", "Very Weak")))
# name2size <- c(
# "P. adj. \u2265 0.05" = 0.1,
# "P. adj. < 0.05" = 1,
# "P. adj. < 0.01" = 1.5,
# "P. adj. < 0.001" = 2
# )
specific_links_plot = specific_links_plot_df %>%
ggplot(aes(x=size, y=abs(mean_score))) +
geom_line(aes(col=type_correlation)) +
#geom_point(data=(input_df %>% filter(link %in% specific_links)), aes(x = size, y = abs(score), col=link), alpha=0.2) +
geom_ribbon(aes(ymin = abs(min_score), ymax = abs(max_score), fill=type_correlation), alpha=0.3) +
#geom_point(aes(size=max_pval_size), col="red") +
#scale_color_manual(name = "Correlation strength", values = c("#ff7b00", "#F8766D", "#7CAE00", "#00BFC4", "#C77CFF")) +
scale_color_manual(name = "Correlation strength", values = as.vector(name2color[levels(factor(specific_links_plot_df$type_correlation))])) + # Plot using the dictionary
#scale_fill_manual(name = "Correlation strength", values = c("#ff7b00", "#F8766D", "#7CAE00", "#00BFC4", "#C77CFF")) +
scale_fill_manual(name = "Correlation strength", values = as.vector(name2color[levels(factor(specific_links_plot_df$type_correlation))])) + # Plot using the dictionary
new_scale_color() +
geom_point(data = ~filter(.x, max_pval < 0.05), aes(x=size, y=abs(mean_score), col=max_pval_discrete)) +
scale_color_manual(name = NULL, values = c("red")) +
#geom_errorbar(aes(ymin=abs(mean_score)-sd_score, ymax=abs(mean_score)+sd_score), width=.2, position=position_dodge(0.05)) +
theme_linedraw() +
xlab("Num. samples") +
ylab("Pearson correlation (abs)") +
#scale_size_manual(values = as.vector(name2size[levels(factor(specific_links_plot_df$max_pval_size))])) +
#scale_color_manual(values = c("#C77CFF", "#7CAE00", "#FF7B00", "#F8766D", "#00BFC4")) +
#scale_fill_manual(values = c("#C77CFF", "#7CAE00", "#FF7B00", "#F8766D", "#00BFC4")) +
#guides(col=guide_legend(title="Correlation strength"), fill=guide_legend(title="Correlation strength")) +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
legend.text = element_text(size = 16),
legend.title=element_text(size=16, face="bold"),
text = element_text(family = "Helvetica"))
plot_file = paste(plots_dir, "specific_links_gtex_Whole.Blood.png", sep="/")
ggsave(
plot_file,
plot = specific_links_plot,
dpi = 1200,
#width = 9000,
#height = 6000,
units = c("px")
)
## Saving 8400 x 6000 px image
print(specific_links_plot)
Read data:
sd_table_file = paste(tables_dir, "analysis_sd_coexpression_weight_gtex_Whole.Blood.txt", sep="/")
sd_table_df = fread(sd_table_file)
Plot:
# Define palette using the scale_fill_brewer Oranges palette and selecting manually the colors using RColorBrewer
# https://stackoverflow.com/questions/29466239/how-to-set-the-color-range-of-scale-colour-brewer-in-ggplot2-palette-selecte
my_orange = RColorBrewer::brewer.pal(n = (length(unique(sd_table_df$ss)) + 2), "Oranges")[3:(length(unique(sd_table_df$ss))+2)] # I get 8 and exclude the 2 first colors, which are more light
sd_plot_file = paste(plots_dir, "analysis_sd_coexpression_weight_gtex_Whole.Blood.png", sep="/")
sd_plot = sd_table_df %>%
mutate(ss = factor(ss, levels = as.character(sort(as.numeric(unique(sd_table_df$ss)))))) %>%
ggplot(aes(x = ss, y = sd_correlation)) +
#geom_half_violin(side = "r", color="orangered4", fill="orangered1") +
geom_half_violin(aes(fill=ss), color="transparent", side = "r") +
#scale_fill_brewer(palette="Oranges") +
scale_fill_manual(values=my_orange) +
theme_linedraw() +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none") +
labs (x = "Num. samples",
y = "Pearson correlation SD",
color = "",
fill = "")
ggsave(
sd_plot_file,
plot = sd_plot,
dpi = 1200,
#width = 9300,
#height = 6000,
units = c("px")
)
## Saving 8400 x 6000 px image
print(sd_plot)
rm(sd_table_df)
Patchwork:
# Use patchwork to concatenate plots
(correlation_levels_plot /
specific_links_plot /
sd_plot) +
plot_annotation(tag_levels = 'A') & # Include letters
theme(plot.tag = element_text(face = 'bold', size = 17))
plot_file = paste(plots_dir, "/sd_coexpression_results.png", sep="")
ggsave(
plot_file,
dpi = 1200,
width = 10000,
height = 17000,
units = c("px")
)
Supplementary figure with the analysis of SD for different datasets:
datasets_analysis = c("tcga_TCGA-BRCA_female", "tcga_TCGA-LUAD")
datasets_analysis_titles = c("TCGA: Breast cancer", "TCGA: Lung cancer")
sd_plots = list()
for (x in 1:length(datasets_analysis)){
dataset_selected = datasets_analysis[x]
sd_table_file = paste(tables_dir, "/analysis_sd_coexpression_weight_", dataset_selected, ".txt", sep="")
sd_table_df = fread(sd_table_file)
sd_plot_file = paste(plots_dir, "/analysis_sd_coexpression_weight_", dataset_selected, ".png", sep="")
my_orange = RColorBrewer::brewer.pal(n = (length(unique(sd_table_df$ss)) + 2), "Oranges")[3:(length(unique(sd_table_df$ss))+2)]
sd_plot = sd_table_df %>%
mutate(ss = factor(ss, levels = as.character(sort(as.numeric(unique(sd_table_df$ss)))))) %>%
ggplot(aes(x = ss, y = sd_correlation)) +
#geom_half_violin(side = "r", color="orangered4", fill="orangered1") +
geom_half_violin(aes(fill=ss), color="transparent", side = "r") +
#scale_fill_brewer(palette="Oranges") +
scale_fill_manual(values=my_orange) +
theme_linedraw() +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none") +
labs (x = "Num. samples",
y = "Pearson correlation SD",
title = datasets_analysis_titles[[x]],
color = "",
fill = "")
sd_plots[[x]] = sd_plot
ggsave(
sd_plot_file,
plot = sd_plot,
dpi = 1200,
width = 6500,
height = 6500,
units = c("px")
)
print(sd_plot)
rm(sd_table_df)
}
Same plots but for small sample sizes:
datasets_analysis = c("tcga_TCGA-BRCA_female", "tcga_TCGA-LUAD")
datasets_analysis_titles = c("TCGA: Breast cancer", "TCGA: Lung cancer")
sd_plots_small_sizes = list()
for (x in 1:length(datasets_analysis)){
dataset_selected = datasets_analysis[x]
sd_table_file = paste(tables_dir, "/analysis_sd_coexpression_weight_with_small_sizes_", dataset_selected, ".txt", sep="")
sd_table_df = fread(sd_table_file)
sd_plot_file = paste(plots_dir, "/analysis_sd_coexpression_weight_with_small_sizes_", dataset_selected, ".png", sep="")
my_orange = RColorBrewer::brewer.pal(n = (length(unique(sd_table_df$ss)) + 2), "Oranges")[3:(length(unique(sd_table_df$ss))+2)]
sd_plot_small_sizes = sd_table_df %>%
mutate(ss = factor(ss, levels = as.character(sort(as.numeric(unique(sd_table_df$ss)))))) %>%
ggplot(aes(x = ss, y = sd_correlation)) +
#geom_half_violin(side = "r", color="orangered4", fill="orangered1") +
geom_half_violin(aes(fill=ss), color="transparent", side = "r") +
#scale_fill_brewer(palette="Oranges") +
scale_fill_manual(values=my_orange) +
theme_linedraw() +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
text = element_text(family = "Helvetica"),
legend.position="none") +
labs (x = "Num. samples",
y = "Pearson correlation SD",
title = datasets_analysis_titles[[x]],
color = "",
fill = "")
sd_plots_small_sizes[[x]] = sd_plot_small_sizes
ggsave(
sd_plot_file,
plot = sd_plot_small_sizes,
dpi = 1200,
width = 6500,
height = 6500,
units = c("px")
)
print(sd_plot_small_sizes)
rm(sd_table_df)
}
Patchwork for the supplementary figure:
layout <- "
AB
"
patch_breast = ((sd_plots_small_sizes[[1]] + labs(tag = 'A', title = NULL)) +
(sd_plots[[1]] + labs(tag = 'B', title = NULL))) +
plot_annotation(title = 'TCGA: Breast cancer', theme=theme(plot.title = element_text(size = 20, face="bold"))) &
theme(plot.tag = element_text(face = 'bold', size = 17),
plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
layout <- "
CD
"
patch_lung = ((sd_plots_small_sizes[[2]] + labs(tag = 'C', title = NULL)) +
(sd_plots[[2]] + labs(tag = 'D', title = NULL))) +
plot_annotation(title = 'TCGA: Lung cancer', theme=theme(plot.title = element_text(size = 20, face="bold"))) &
theme(plot.tag = element_text(face = 'bold', size = 17),
plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm"))
(wrap_elements(patch_breast) / wrap_elements(patch_lung))
plot_file = paste(plots_dir, "/analysis_sd_coexpression_weight_SI.png", sep="")
ggsave(
plot_file,
dpi = 1200,
width = 12000,
height = 14000,
units = c("px")
)
name_D = "tcga.brca.female"
name_N = "tcga.breast.female"
diffcoexp_plots_dir = "/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/plots/plots_differential_coexpression/TCGA-BRCA_female___TCGA-Breast_female"
diffcoexp_tables_dir = "/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/tables/tables_differential_coexpression/TCGA-BRCA_female___TCGA-Breast_female"
name2color_df = name2color %>% as.data.frame() %>% dplyr::rename("rgb_col"=".") %>% tibble::rownames_to_column("name")
Read file:
change_of_category_file = paste(diffcoexp_tables_dir, paste("codina_", name_D, "_", name_N, "_changes.txt", sep=""), sep="/")
change_of_category_df = fread(change_of_category_file)
Make plot:
sizes_list = sort(unique(change_of_category_df$size.prev))
if (max(sizes_list) <= 160) {
sizes_to_plot = c(20, 40, 60, 80, 100, 120, 140, 160)
} else if ((max(sizes_list) > 160) & (max(sizes_list) <= 300)) {
sizes_to_plot = c(20, 60, 100, 140, 180, 220, 260, 300)
} else if ((max(sizes_list) > 300) & (max(sizes_list) <= 440)) {
sizes_to_plot = c(20, 80, 140, 200, 260, 320, 380, 440)
} else {
sizes_to_plot = seq(20, max(sizes_list), 100)
}
# Create last column of plot (size 100)
change_of_category_size100_df = change_of_category_df %>%
filter(size.curr == 100) %>%
dplyr::select(Node, Phi_name.curr, size.curr) %>%
dplyr::rename("Phi_name.prev"="Phi_name.curr", "size.prev"="size.curr") %>%
mutate(Phi_name.curr = NA) %>%
mutate(size.curr = NA) %>%
mutate(transition_type = NA) %>%
mutate(transition_name = NA) %>%
mutate(transition_name_simple = Phi_name.prev) %>%
mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple == "common", "common (constant)")) %>%
mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple == "disease-specific", "disease-specific (constant)")) %>%
mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple == "normal-specific", "normal-specific (constant)")) %>%
mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple == "undefined", "undefined (constant)")) %>%
left_join(name2color_df, by=c("Phi_name.prev"="name"))
# Count transitions / merge with colors / merge with last column
change_of_category_col_df = change_of_category_df %>%
mutate(transition_name_simple = transition_name) %>%
mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple == "common / common", "common (constant)")) %>%
mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple %in% c("common / disease-specific", "common / normal-specific", "common / undefined"), "common (change)")) %>%
mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple == "disease-specific / disease-specific", "disease-specific (constant)")) %>%
mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple %in% c("disease-specific / common", "disease-specific / normal-specific", "disease-specific / undefined"), "disease-specific (change)")) %>%
mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple == "normal-specific / normal-specific", "normal-specific (constant)")) %>%
mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple %in% c("normal-specific / common", "normal-specific / disease-specific", "normal-specific / undefined"), "normal-specific (change)")) %>%
mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple == "undefined / undefined", "undefined (constant)")) %>%
mutate(transition_name_simple = replace(transition_name_simple, transition_name_simple %in% c("undefined / disease-specific", "undefined / normal-specific", "undefined / common"), "undefined (change)")) %>%
left_join(name2color_df, by=c("transition_name_simple"="name")) %>%
full_join(change_of_category_size100_df)
## Joining, by = c("Node", "Phi_name.prev", "size.prev", "Phi_name.curr",
## "size.curr", "transition_type", "transition_name", "transition_name_simple",
## "rgb_col")
# Define factors
change_of_category_col_df$transition_name_simple <- factor(change_of_category_col_df$transition_name_simple, levels=c("common (constant)", "common (change)", "disease-specific (constant)", "disease-specific (change)", "normal-specific (constant)", "normal-specific (change)", "undefined (constant)", "undefined (change)"))
change_of_category_col_df$size.prev = as.character(change_of_category_col_df$size.prev)
change_of_category_col_df$size.prev <- factor(change_of_category_col_df$size.prev, levels=as.character(sort(unique(as.integer(change_of_category_col_df$size.prev)))))
# Plot
plot_flow = change_of_category_col_df %>%
filter(size.prev %in% sizes_to_plot) %>%
ggplot(aes(x = size.prev, stratum = transition_name_simple, alluvium = Node,
fill = transition_name_simple, label = transition_name_simple)) +
geom_flow(stat = "alluvium", lode.guidance = "frontback") +
#geom_stratum() +
geom_stratum(aes(col=transition_name_simple), size=0) +
scale_fill_manual(name="Gene category", values=setNames(change_of_category_col_df$rgb_col, change_of_category_col_df$transition_name_simple)) + # If I want to plot only specific elements in the legend, use the argument breaks with a list of the elements that I want to show
scale_color_manual(name="Gene category", values=setNames(change_of_category_col_df$rgb_col, change_of_category_col_df$transition_name_simple)) +
xlab("Num. samples") +
ylab("Num. disease genes") +
guides(fill=guide_legend(title="Gene category")) +
theme_linedraw() +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
legend.text = element_text(size = 16),
legend.title=element_text(size=16, face="bold"),
text = element_text(family = "Helvetica"))
plot_file = paste(diffcoexp_plots_dir, paste("codina_", name_D, "_", name_N, "_change_disease_genes_improved.png", sep=""), sep="/")
ggsave(
plot_file,
plot = plot_flow,
dpi = 1200,
width = 10000,
height = 6000,
units = c("px")
)
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
print(plot_flow)
Calculate stable genes by category:
# Calculate total of stable genes
total_stable_change_genes_df = change_of_category_df %>%
group_by(size.curr) %>%
mutate(n_total = n()) %>%
ungroup() %>%
select(Node, Phi_name.curr, size.curr, transition_type, n_total) %>%
group_by(size.curr, transition_type, n_total) %>%
summarize(n_genes = n()) %>%
ungroup() %>%
mutate(frac_genes = n_genes / n_total)
## `summarise()` has grouped output by 'size.curr', 'transition_type'. You can
## override using the `.groups` argument.
total_stable_change_genes_df$Phi_name.curr = "all"
# Calculate stable genes by category
categories_stable_change_genes_df = change_of_category_df %>%
group_by(size.curr) %>%
mutate(n_total = n()) %>%
ungroup() %>%
filter(transition_type == "stable") %>%
select(Node, Phi_name.curr, size.curr, transition_type, n_total) %>%
group_by(Phi_name.curr, size.curr, transition_type, n_total) %>%
summarize(n_genes = n()) %>%
ungroup() %>%
mutate(frac_genes = n_genes / n_total)
## `summarise()` has grouped output by 'Phi_name.curr', 'size.curr',
## 'transition_type'. You can override using the `.groups` argument.
# Merge two tables
stable_genes_df = rbind(total_stable_change_genes_df, categories_stable_change_genes_df) %>%
filter(transition_type == "stable") %>%
left_join(name2color_df, by=c("Phi_name.curr"="name"))
Make plot:
plot_stable_genes = stable_genes_df %>%
ggplot(aes(x = size.curr, y = frac_genes, col = Phi_name.curr)) +
geom_line(aes(size = Phi_name.curr)) +
xlab("Num. samples") +
ylab("Fraction of stable disease genes") +
scale_size_manual(values = c("all" = 1, "common" = 0.5, "disease-specific" = 0.5, "normal-specific" = 0.5, "undefined" = 0.5, "different" = 0.5)) +
scale_color_manual(values=setNames(stable_genes_df$rgb_col, stable_genes_df$Phi_name.curr)) +
guides(col=guide_legend(title="Gene category"), size="none") +
theme_linedraw() +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
legend.text = element_text(size = 16),
legend.title=element_text(size=16, face="bold"),
text = element_text(family = "Helvetica"))
plot_file = paste(diffcoexp_plots_dir, paste("codina_", name_D, "_", name_N, "_stable_genes.png", sep=""), sep="/")
ggsave(
plot_file,
plot = plot_stable_genes,
dpi = 1200,
width = 9000,
height = 6000,
units = c("px")
)
print(plot_stable_genes)
Calculate ranking of widely known genes:
ranking_nodes_file = paste(diffcoexp_tables_dir, paste("codina_", name_D, "_", name_N, "_nodes_ranked.txt", sep=""), sep="/")
ranking_nodes_df= fread(ranking_nodes_file)
nodes_to_follow_file = "/home/j.aguirreplans/Projects/Scipher/SampleSize/data/nodes_to_follow/breast_cancer.txt"
nodes_to_follow = fread(nodes_to_follow_file, header = F)$V1
Make plot:
nodes_to_follow_ranking_plot = ranking_nodes_df %>%
filter(Node %in% nodes_to_follow) %>%
mutate(label=if_else(size==max(size), Node, "")) %>%
ggplot(aes(x = size, y = rank)) +
geom_point(aes(group = Node, col = Phi_name)) +
geom_line(aes(group = Node, col = Phi_name)) +
xlab("Num. samples") +
ylab("Ranking") +
scale_color_manual(values=as.vector(name2color_df[name2color_df$name %in% levels(factor(unique((ranking_nodes_df %>% filter(Node %in% nodes_to_follow))$Phi_name))),]$rgb_col)) +
scale_y_continuous(trans = "reverse") +
guides(col=guide_legend(title="Gene category"), size="none") +
theme_linedraw() +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
legend.text = element_text(size = 16),
legend.title=element_text(size=16, face="bold"),
text = element_text(family = "Helvetica")) +
geom_label_repel(aes(col=Phi_name, label = label),
fill="white",
box.padding = 0.2, max.overlaps = Inf,
label.size = 0.75,
size = 5,
alpha = 0.9,
label.padding=0.25,
fontface = "bold",
na.rm=TRUE,
show.legend=F,
seed = 1234)
plot_file = paste(diffcoexp_plots_dir, "nodes_to_follow_ranking.png", sep="/")
ggsave(
plot_file,
plot=nodes_to_follow_ranking_plot,
dpi = 1200,
#width = 9500,
height = 6000,
units = c("px")
)
## Saving 8400 x 6000 px image
print(nodes_to_follow_ranking_plot)
Use patchwork to plot everything together:
(plot_flow /
nodes_to_follow_ranking_plot /
plot_stable_genes) +
plot_annotation(tag_levels = 'A') & # Include letters
theme(plot.tag = element_text(face = 'bold', size = 17)
,plot.tag.position = c(.05, 1.04)
,plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm")
) # that are bold
plot_file = paste(diffcoexp_plots_dir, "/codina_stability_results.png", sep="")
ggsave(
plot_file,
dpi = 1200,
width = 10000,
#height = 12000,
height = 17500,
units = c("px")
)
Load PPI + co-expression dataframe:
dataset = "gtex"
type_dataset = "Whole.Blood"
sample_group_type = "tumor" # if dataset = "tcga"
type_counts = "reads" # tpm, reads
sizes_selected = c(20, 100, 200, 300, 400, 500)
sizes_selected_comparison_plot = c(20, 100, 200, 300)
threshold_selected = 0.05
coexpression_pairs_file = sprintf("/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/tables/ppi_analysis_pairs_pval_%s_%s_%s.txt", as.character(threshold_selected), dataset, type_dataset)
coexpression_pairs_filt = fread(coexpression_pairs_file)
perf_file = sprintf("/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/tables/ppi_coexpression_analysis_ppi_vs_random_far_pairs_performance_%s_%s.txt", dataset, type_dataset)
perf_df = fread(perf_file)
colors_defined = c("PPI" = "#DBA9CA",
"PPI & iNCI" = "#FAF69E",
"Random" = "#B8BECC",
"Random pairs in PPI" = "#B8BECC",
"Random (distance > 4)" = "#543F4D",
"Distance > 4" = "#543F4D",
"Random pairs in PPI (dist. > 4)" = "#543F4D",
"Random pairs in PPI & iNCI (dist. > 4)" = "#545334",
"1" = "#DBA9CA",
"2" = "#a8829b",
"3" = "#87677c",
"4" = "#5e4857",
"5" = "#40303b",
"6" = "#1c151a")
Violin plot of PPI vs. pairs > 4 distance across sample size:
violinplot_ppi_coexpression_by_types_nozero_file = sprintf("/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/plots/ppi_coexpression_violinplot_by_pair_type_nozero_pval_%s_%s_%s.png", as.character(threshold_selected), dataset, type_dataset)
colors_defined_filt = colors_defined[names(colors_defined) %in% c("PPI", "Distance > 4")]
violinplot_ppi_coexpression_by_types_nozero = coexpression_pairs_filt %>%
select(-distances_ppi, -distances_nci, -nci, -random_not_ppi, -random_far_not_nci) %>%
pivot_longer(c("ppi", "random_far_not_ppi"), names_to = "category", values_to = "category_bool") %>%
filter(category_bool == TRUE) %>%
select(-category_bool) %>%
pivot_longer(as.character(sizes_selected), names_to = "size", values_to = "correlation", names_transform = list(size = as.character), values_transform = list(correlation = as.numeric)) %>%
filter(size %in% sizes_selected_comparison_plot) %>%
filter(!(is.na(correlation))) %>%
filter(abs(correlation) > 0 ) %>%
mutate(category = replace(category, category == "ppi", "PPI")) %>%
mutate(category = replace(category, category == "random_far_not_ppi", "Distance > 4")) %>%
mutate(category = factor(category, levels = c("PPI", "Distance > 4"))) %>%
ggplot() +
aes(x = reorder(size, as.numeric(size)),
y = abs(correlation),
fill = category,
colour = category) +
geom_violin(alpha = 0.8,
scale = "width",
adjust = 0.5) +
scale_fill_manual(values = colors_defined_filt) +
scale_color_manual(values = colors_defined_filt) +
theme_linedraw() +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
legend.text = element_text(size = 16),
legend.title=element_text(size=16, face="bold"),
text = element_text(family = "Helvetica")) +
labs (x = "Number of samples",
y = "Absolute co-expression weight",
color = "",
fill = ""); violinplot_ppi_coexpression_by_types_nozero
ggsave(
violinplot_ppi_coexpression_by_types_nozero_file,
plot = violinplot_ppi_coexpression_by_types_nozero,
dpi = 1200,
#width = 9300,
#height = 6000,
units = c("px")
)
## Saving 8400 x 6000 px image
Violin plot of PPI distances across sample size:
violinplot_ppi_coexpression_by_distances_nozero_file = sprintf("/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/plots/ppi_coexpression_violinplot_by_ppi_distance_nozero_pval_%s_%s_%s.png", as.character(threshold_selected), dataset, type_dataset)
colors_defined_filt = colors_defined[names(colors_defined) %in% c("1", "2", "3", "4", "5", "6")]
violinplot_ppi_coexpression_by_distances_nozero = coexpression_pairs_filt %>%
select(-ppi, -nci, -distances_nci, -random_not_ppi, -random_far_not_ppi, -random_far_not_nci) %>%
filter(!(is.na(distances_ppi))) %>%
pivot_longer(as.character(sizes_selected), names_to = "size", values_to = "correlation", names_transform = list(size = as.character), values_transform = list(correlation = as.numeric)) %>%
filter(size %in% sizes_selected_comparison_plot) %>%
filter(!(is.na(correlation))) %>%
filter(abs(correlation) > 0 ) %>%
mutate(across(c(distances_ppi), as.character)) %>%
ggplot() +
aes(x = reorder(size, as.numeric(size)),
y = abs(correlation),
fill = distances_ppi,
colour = distances_ppi) +
geom_violin(alpha = 0.8,
scale = "width",
adjust = 0.5) +
scale_fill_manual(values = colors_defined_filt) +
scale_color_manual(values = colors_defined_filt) +
theme_linedraw() +
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
legend.text = element_text(size = 16),
legend.title=element_text(size=16, face="bold"),
text = element_text(family = "Helvetica")) +
labs (x = "Number of samples",
y = "Absolute co-expression weight",
color = "PPI distance",
fill = "PPI distance")
ggsave(
violinplot_ppi_coexpression_by_distances_nozero_file,
plot = violinplot_ppi_coexpression_by_distances_nozero,
dpi = 1200,
#width = 9300,
#height = 6000,
units = c("px")
)
## Saving 8400 x 6000 px image
violinplot_ppi_coexpression_by_distances_nozero
AUC of PPI prediction:
perf_auc = perf_df %>%
select(size, AUC) %>%
unique()
print(perf_auc)
## size AUC
## 1: 20 0.6718979
## 2: 100 0.6992475
## 3: 200 0.6962787
## 4: 300 0.7063418
## 5: 400 0.7090533
## 6: 500 0.7072046
auc_plot = perf_auc %>%
mutate(size = factor(size,
levels = sizes_selected,
labels = sizes_selected)) %>%
ggplot() +
aes(x = size,
#fill = base,
#colour = base,
weight = AUC) +
geom_bar(col="#DBA9CA",
fill="#DBA9CA") +
#scale_fill_manual(values = colors_defined) +
#scale_color_manual(values = colors_defined) +
coord_cartesian(ylim=c(0.5,NA)) +
theme_linedraw() +
#facet_grid(vars(name))+
theme(aspect.ratio=1,
plot.title = element_text(size = 20, face="bold"),
axis.title = element_text(size = 17, face="bold"),
axis.text = element_text(size = 16),
panel.grid.major=element_blank(),
panel.grid.minor = element_blank(),
legend.text = element_text(size = 16),
legend.title=element_text(size=16, face="bold"),
text = element_text(family = "Helvetica")) +
labs(y = "AUROC",
x = "Number of samples"
)
auc_plot_file = sprintf("/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/plots/ppi_coexpression_perf_AUC_pval_%s_%s_%s.png", as.character(threshold_selected), dataset, type_dataset)
ggsave(
auc_plot_file,
plot = auc_plot,
dpi = 1200,
#width = 9300,
height = 6000,
units = c("px")
)
## Saving 8400 x 6000 px image
auc_plot
Patchwork (filtered by p-value):
# Use patchwork to concatenate plots
((violinplot_ppi_coexpression_by_types_nozero + theme(legend.position="bottom")) |
(violinplot_ppi_coexpression_by_distances_nozero + theme(legend.position="bottom")) |
auc_plot) +
plot_annotation(tag_levels = 'A') &
theme(plot.tag = element_text(face = 'bold', size = 17),
plot.tag.position = c(.01, 1.06),
plot.margin = margin(0.6, 0.6, 0.6, 0.6, "cm")
)
plot_file = paste(plots_dir, "/ppi_coexpression_results.png", sep="")
ggsave(
plot_file,
dpi = 1200,
width = 20000,
height = 9000,
units = c("px")
)